path: root/test/CodeGen/AArch64
Diffstat (limited to 'test/CodeGen/AArch64')
-rw-r--r--test/CodeGen/AArch64/128bit_load_store.ll20
-rw-r--r--test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll (renamed from test/CodeGen/AArch64/neon-v1i1-setcc.ll)13
-rw-r--r--test/CodeGen/AArch64/adc.ll4
-rw-r--r--test/CodeGen/AArch64/addsub-shifted.ll18
-rw-r--r--test/CodeGen/AArch64/addsub.ll22
-rw-r--r--test/CodeGen/AArch64/addsub_ext.ll8
-rw-r--r--test/CodeGen/AArch64/alloca.ll121
-rw-r--r--test/CodeGen/AArch64/analyze-branch.ll4
-rw-r--r--test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll47
-rw-r--r--test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll45
-rw-r--r--test/CodeGen/AArch64/arm64-2011-03-21-Unaligned-Frame-Index.ll12
-rw-r--r--test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll26
-rw-r--r--test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll31
-rw-r--r--test/CodeGen/AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll40
-rw-r--r--test/CodeGen/AArch64/arm64-2012-05-07-DAGCombineVectorExtract.ll20
-rw-r--r--test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll21
-rw-r--r--test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll22
-rw-r--r--test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll50
-rw-r--r--test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll67
-rw-r--r--test/CodeGen/AArch64/arm64-2012-07-11-InstrEmitterBug.ll56
-rw-r--r--test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll19
-rw-r--r--test/CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll15
-rw-r--r--test/CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll37
-rw-r--r--test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll11
-rw-r--r--test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll23
-rw-r--r--test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll19
-rw-r--r--test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll23
-rw-r--r--test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll67
-rw-r--r--test/CodeGen/AArch64/arm64-aapcs.ll103
-rw-r--r--test/CodeGen/AArch64/arm64-abi-varargs.ll191
-rw-r--r--test/CodeGen/AArch64/arm64-abi.ll238
-rw-r--r--test/CodeGen/AArch64/arm64-abi_align.ll532
-rw-r--r--test/CodeGen/AArch64/arm64-addp.ll32
-rw-r--r--test/CodeGen/AArch64/arm64-addr-mode-folding.ll171
-rw-r--r--test/CodeGen/AArch64/arm64-addr-type-promotion.ll82
-rw-r--r--test/CodeGen/AArch64/arm64-addrmode.ll72
-rw-r--r--test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll21
-rw-r--r--test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll29
-rw-r--r--test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll72
-rw-r--r--test/CodeGen/AArch64/arm64-ands-bad-peephole.ll31
-rw-r--r--test/CodeGen/AArch64/arm64-anyregcc-crash.ll19
-rw-r--r--test/CodeGen/AArch64/arm64-anyregcc.ll363
-rw-r--r--test/CodeGen/AArch64/arm64-arith-saturating.ll153
-rw-r--r--test/CodeGen/AArch64/arm64-arith.ll262
-rw-r--r--test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll16
-rw-r--r--test/CodeGen/AArch64/arm64-atomic-128.ll225
-rw-r--r--test/CodeGen/AArch64/arm64-atomic.ll331
-rw-r--r--test/CodeGen/AArch64/arm64-basic-pic.ll54
-rw-r--r--test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll1101
-rw-r--r--test/CodeGen/AArch64/arm64-big-endian-eh.ll73
-rw-r--r--test/CodeGen/AArch64/arm64-big-endian-varargs.ll58
-rw-r--r--test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll848
-rw-r--r--test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll1100
-rw-r--r--test/CodeGen/AArch64/arm64-big-imm-offsets.ll14
-rw-r--r--test/CodeGen/AArch64/arm64-big-stack.ll21
-rw-r--r--test/CodeGen/AArch64/arm64-bitfield-extract.ll532
-rw-r--r--test/CodeGen/AArch64/arm64-blockaddress.ll30
-rw-r--r--test/CodeGen/AArch64/arm64-build-vector.ll35
-rw-r--r--test/CodeGen/AArch64/arm64-call-tailcalls.ll91
-rw-r--r--test/CodeGen/AArch64/arm64-cast-opt.ll31
-rw-r--r--test/CodeGen/AArch64/arm64-ccmp-heuristics.ll190
-rw-r--r--test/CodeGen/AArch64/arm64-ccmp.ll289
-rw-r--r--test/CodeGen/AArch64/arm64-clrsb.ll36
-rw-r--r--test/CodeGen/AArch64/arm64-coalesce-ext.ll17
-rw-r--r--test/CodeGen/AArch64/arm64-code-model-large-abs.ll72
-rw-r--r--test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll37
-rw-r--r--test/CodeGen/AArch64/arm64-collect-loh-str.ll23
-rw-r--r--test/CodeGen/AArch64/arm64-collect-loh.ll53
-rw-r--r--test/CodeGen/AArch64/arm64-complex-copy-noneon.ll21
-rw-r--r--test/CodeGen/AArch64/arm64-complex-ret.ll7
-rw-r--r--test/CodeGen/AArch64/arm64-const-addr.ll23
-rw-r--r--test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll24
-rw-r--r--test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll29
-rw-r--r--test/CodeGen/AArch64/arm64-copy-tuple.ll146
-rw-r--r--test/CodeGen/AArch64/arm64-crc32.ll71
-rw-r--r--test/CodeGen/AArch64/arm64-crypto.ll135
-rw-r--r--test/CodeGen/AArch64/arm64-cse.ll59
-rw-r--r--test/CodeGen/AArch64/arm64-csel.ll230
-rw-r--r--test/CodeGen/AArch64/arm64-cvt.ll401
-rw-r--r--test/CodeGen/AArch64/arm64-dagcombiner-convergence.ll19
-rw-r--r--test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll29
-rw-r--r--test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll46
-rw-r--r--test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll102
-rw-r--r--test/CodeGen/AArch64/arm64-dead-def-frame-index.ll18
-rw-r--r--test/CodeGen/AArch64/arm64-dead-register-def-bug.ll32
-rw-r--r--test/CodeGen/AArch64/arm64-dup.ll323
-rw-r--r--test/CodeGen/AArch64/arm64-early-ifcvt.ll423
-rw-r--r--test/CodeGen/AArch64/arm64-elf-calls.ll20
-rw-r--r--test/CodeGen/AArch64/arm64-elf-constpool.ll13
-rw-r--r--test/CodeGen/AArch64/arm64-elf-globals.ll115
-rw-r--r--test/CodeGen/AArch64/arm64-ext.ll118
-rw-r--r--test/CodeGen/AArch64/arm64-extend-int-to-fp.ll19
-rw-r--r--test/CodeGen/AArch64/arm64-extend.ll15
-rw-r--r--test/CodeGen/AArch64/arm64-extern-weak.ll51
-rw-r--r--test/CodeGen/AArch64/arm64-extload-knownzero.ll28
-rw-r--r--test/CodeGen/AArch64/arm64-extract.ll58
-rw-r--r--test/CodeGen/AArch64/arm64-extract_subvector.ll51
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll47
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-alloca.ll25
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-br.ll155
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-call.ll100
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-conversion.ll442
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll146
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-gv.ll38
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-icmp.ll214
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll36
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll135
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-materialize.ll27
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll68
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-rem.ll44
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-ret.ll63
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel-select.ll63
-rw-r--r--test/CodeGen/AArch64/arm64-fast-isel.ll95
-rw-r--r--test/CodeGen/AArch64/arm64-fastcc-tailcall.ll24
-rw-r--r--test/CodeGen/AArch64/arm64-fastisel-gep-promote-before-add.ll18
-rw-r--r--test/CodeGen/AArch64/arm64-fcmp-opt.ll204
-rw-r--r--test/CodeGen/AArch64/arm64-fcopysign.ll51
-rw-r--r--test/CodeGen/AArch64/arm64-fixed-point-scalar-cvt-dagcombine.ll15
-rw-r--r--test/CodeGen/AArch64/arm64-fmadd.ll92
-rw-r--r--test/CodeGen/AArch64/arm64-fmax.ll34
-rw-r--r--test/CodeGen/AArch64/arm64-fminv.ll101
-rw-r--r--test/CodeGen/AArch64/arm64-fmuladd.ll88
-rw-r--r--test/CodeGen/AArch64/arm64-fold-address.ll79
-rw-r--r--test/CodeGen/AArch64/arm64-fold-lsl.ll79
-rw-r--r--test/CodeGen/AArch64/arm64-fp-contract-zero.ll14
-rw-r--r--test/CodeGen/AArch64/arm64-fp-imm.ll32
-rw-r--r--test/CodeGen/AArch64/arm64-fp.ll8
-rw-r--r--test/CodeGen/AArch64/arm64-fp128-folding.ll17
-rw-r--r--test/CodeGen/AArch64/arm64-fp128.ll (renamed from test/CodeGen/AArch64/fp128.ll)130
-rw-r--r--test/CodeGen/AArch64/arm64-frame-index.ll11
-rw-r--r--test/CodeGen/AArch64/arm64-frameaddr.ll15
-rw-r--r--test/CodeGen/AArch64/arm64-global-address.ll14
-rw-r--r--test/CodeGen/AArch64/arm64-hello.ll38
-rw-r--r--test/CodeGen/AArch64/arm64-i16-subreg-extract.ll12
-rw-r--r--test/CodeGen/AArch64/arm64-icmp-opt.ll17
-rw-r--r--test/CodeGen/AArch64/arm64-illegal-float-ops.ll295
-rw-r--r--test/CodeGen/AArch64/arm64-indexed-memory.ll351
-rw-r--r--test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll40
-rw-r--r--test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll6174
-rw-r--r--test/CodeGen/AArch64/arm64-inline-asm-error-I.ll11
-rw-r--r--test/CodeGen/AArch64/arm64-inline-asm-error-J.ll11
-rw-r--r--test/CodeGen/AArch64/arm64-inline-asm-error-K.ll11
-rw-r--r--test/CodeGen/AArch64/arm64-inline-asm-error-L.ll11
-rw-r--r--test/CodeGen/AArch64/arm64-inline-asm-error-M.ll11
-rw-r--r--test/CodeGen/AArch64/arm64-inline-asm-error-N.ll11
-rw-r--r--test/CodeGen/AArch64/arm64-inline-asm-zero-reg-error.ll11
-rw-r--r--test/CodeGen/AArch64/arm64-inline-asm.ll230
-rw-r--r--test/CodeGen/AArch64/arm64-join-reserved.ll17
-rw-r--r--test/CodeGen/AArch64/arm64-jumptable.ll35
-rw-r--r--test/CodeGen/AArch64/arm64-large-frame.ll69
-rw-r--r--test/CodeGen/AArch64/arm64-ld1.ll1345
-rw-r--r--test/CodeGen/AArch64/arm64-ldp.ll149
-rw-r--r--test/CodeGen/AArch64/arm64-ldur.ll67
-rw-r--r--test/CodeGen/AArch64/arm64-ldxr-stxr.ll270
-rw-r--r--test/CodeGen/AArch64/arm64-leaf.ll13
-rw-r--r--test/CodeGen/AArch64/arm64-long-shift.ll59
-rw-r--r--test/CodeGen/AArch64/arm64-memcpy-inline.ll112
-rw-r--r--test/CodeGen/AArch64/arm64-memset-inline.ll27
-rw-r--r--test/CodeGen/AArch64/arm64-memset-to-bzero.ll108
-rw-r--r--test/CodeGen/AArch64/arm64-misched-basic-A53.ll (renamed from test/CodeGen/AArch64/misched-basic-A53.ll)28
-rw-r--r--test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll21
-rw-r--r--test/CodeGen/AArch64/arm64-movi.ll202
-rw-r--r--test/CodeGen/AArch64/arm64-mul.ll90
-rw-r--r--test/CodeGen/AArch64/arm64-named-reg-alloc.ll14
-rw-r--r--test/CodeGen/AArch64/arm64-named-reg-notareg.ll13
-rw-r--r--test/CodeGen/AArch64/arm64-neg.ll71
-rw-r--r--test/CodeGen/AArch64/arm64-neon-2velem-high.ll (renamed from test/CodeGen/AArch64/neon-2velem-high.ll)162
-rw-r--r--test/CodeGen/AArch64/arm64-neon-2velem.ll (renamed from test/CodeGen/AArch64/neon-2velem.ll)906
-rw-r--r--test/CodeGen/AArch64/arm64-neon-3vdiff.ll (renamed from test/CodeGen/AArch64/neon-3vdiff.ll)618
-rw-r--r--test/CodeGen/AArch64/arm64-neon-aba-abd.ll (renamed from test/CodeGen/AArch64/neon-aba-abd.ll)90
-rw-r--r--test/CodeGen/AArch64/arm64-neon-across.ll460
-rw-r--r--test/CodeGen/AArch64/arm64-neon-add-pairwise.ll100
-rw-r--r--test/CodeGen/AArch64/arm64-neon-add-sub.ll (renamed from test/CodeGen/AArch64/neon-add-sub.ll)64
-rw-r--r--test/CodeGen/AArch64/arm64-neon-compare-instructions.ll1191
-rw-r--r--test/CodeGen/AArch64/arm64-neon-copy.ll (renamed from test/CodeGen/AArch64/neon-copy.ll)379
-rw-r--r--test/CodeGen/AArch64/arm64-neon-copyPhysReg-tuple.ll48
-rw-r--r--test/CodeGen/AArch64/arm64-neon-mul-div.ll797
-rw-r--r--test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll (renamed from test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll)52
-rw-r--r--test/CodeGen/AArch64/arm64-neon-select_cc.ll (renamed from test/CodeGen/AArch64/neon-select_cc.ll)144
-rw-r--r--test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll482
-rw-r--r--test/CodeGen/AArch64/arm64-neon-simd-shift.ll663
-rw-r--r--test/CodeGen/AArch64/arm64-neon-simd-vget.ll (renamed from test/CodeGen/AArch64/neon-simd-vget.ll)30
-rw-r--r--test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll74
-rw-r--r--test/CodeGen/AArch64/arm64-neon-vector-list-spill.ll (renamed from test/CodeGen/AArch64/neon-vector-list-spill.ll)90
-rw-r--r--test/CodeGen/AArch64/arm64-patchpoint.ll171
-rw-r--r--test/CodeGen/AArch64/arm64-pic-local-symbol.ll22
-rw-r--r--test/CodeGen/AArch64/arm64-platform-reg.ll26
-rw-r--r--test/CodeGen/AArch64/arm64-popcnt.ll43
-rw-r--r--test/CodeGen/AArch64/arm64-prefetch.ll88
-rw-r--r--test/CodeGen/AArch64/arm64-promote-const.ll255
-rw-r--r--test/CodeGen/AArch64/arm64-redzone.ll18
-rw-r--r--test/CodeGen/AArch64/arm64-reg-copy-noneon.ll20
-rw-r--r--test/CodeGen/AArch64/arm64-register-offset-addressing.ll145
-rw-r--r--test/CodeGen/AArch64/arm64-register-pairing.ll53
-rw-r--r--test/CodeGen/AArch64/arm64-regress-f128csel-flags.ll27
-rw-r--r--test/CodeGen/AArch64/arm64-regress-interphase-shift.ll33
-rw-r--r--test/CodeGen/AArch64/arm64-return-vector.ll11
-rw-r--r--test/CodeGen/AArch64/arm64-returnaddr.ll26
-rw-r--r--test/CodeGen/AArch64/arm64-rev.ll235
-rw-r--r--test/CodeGen/AArch64/arm64-rounding.ll208
-rw-r--r--test/CodeGen/AArch64/arm64-scaled_iv.ll38
-rw-r--r--test/CodeGen/AArch64/arm64-scvt.ll830
-rw-r--r--test/CodeGen/AArch64/arm64-shifted-sext.ll277
-rw-r--r--test/CodeGen/AArch64/arm64-simd-scalar-to-vector.ll22
-rw-r--r--test/CodeGen/AArch64/arm64-simplest-elf.ll18
-rw-r--r--test/CodeGen/AArch64/arm64-sincos.ll42
-rw-r--r--test/CodeGen/AArch64/arm64-sitofp-combine-chains.ll22
-rw-r--r--test/CodeGen/AArch64/arm64-sli-sri-opt.ll41
-rw-r--r--test/CodeGen/AArch64/arm64-smaxv.ll74
-rw-r--r--test/CodeGen/AArch64/arm64-sminv.ll74
-rw-r--r--test/CodeGen/AArch64/arm64-spill-lr.ll74
-rw-r--r--test/CodeGen/AArch64/arm64-spill.ll15
-rw-r--r--test/CodeGen/AArch64/arm64-st1.ll676
-rw-r--r--test/CodeGen/AArch64/arm64-stack-no-frame.ll20
-rw-r--r--test/CodeGen/AArch64/arm64-stackmap.ll288
-rw-r--r--test/CodeGen/AArch64/arm64-stackpointer.ll24
-rw-r--r--test/CodeGen/AArch64/arm64-stacksave.ll20
-rw-r--r--test/CodeGen/AArch64/arm64-stp.ll101
-rw-r--r--test/CodeGen/AArch64/arm64-strict-align.ll26
-rw-r--r--test/CodeGen/AArch64/arm64-stur.ll98
-rw-r--r--test/CodeGen/AArch64/arm64-subsections.ll5
-rw-r--r--test/CodeGen/AArch64/arm64-subvector-extend.ll141
-rw-r--r--test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll36
-rw-r--r--test/CodeGen/AArch64/arm64-tbl.ll132
-rw-r--r--test/CodeGen/AArch64/arm64-this-return.ll83
-rw-r--r--test/CodeGen/AArch64/arm64-tls-darwin.ll18
-rw-r--r--test/CodeGen/AArch64/arm64-tls-dynamic-together.ll (renamed from test/CodeGen/AArch64/tls-dynamic-together.ll)2
-rw-r--r--test/CodeGen/AArch64/arm64-tls-dynamics.ll (renamed from test/CodeGen/AArch64/tls-dynamics.ll)64
-rw-r--r--test/CodeGen/AArch64/arm64-tls-execs.ll (renamed from test/CodeGen/AArch64/tls-execs.ll)20
-rw-r--r--test/CodeGen/AArch64/arm64-trap.ll8
-rw-r--r--test/CodeGen/AArch64/arm64-trn.ll134
-rw-r--r--test/CodeGen/AArch64/arm64-trunc-store.ll75
-rw-r--r--test/CodeGen/AArch64/arm64-umaxv.ll92
-rw-r--r--test/CodeGen/AArch64/arm64-uminv.ll92
-rw-r--r--test/CodeGen/AArch64/arm64-umov.ll33
-rw-r--r--test/CodeGen/AArch64/arm64-unaligned_ldst.ll41
-rw-r--r--test/CodeGen/AArch64/arm64-uzp.ll107
-rw-r--r--test/CodeGen/AArch64/arm64-vaargs.ll20
-rw-r--r--test/CodeGen/AArch64/arm64-vabs.ll804
-rw-r--r--test/CodeGen/AArch64/arm64-vadd.ll941
-rw-r--r--test/CodeGen/AArch64/arm64-vaddlv.ll26
-rw-r--r--test/CodeGen/AArch64/arm64-vaddv.ll245
-rw-r--r--test/CodeGen/AArch64/arm64-variadic-aapcs.ll143
-rw-r--r--test/CodeGen/AArch64/arm64-vbitwise.ll91
-rw-r--r--test/CodeGen/AArch64/arm64-vclz.ll109
-rw-r--r--test/CodeGen/AArch64/arm64-vcmp.ll236
-rw-r--r--test/CodeGen/AArch64/arm64-vcnt.ll56
-rw-r--r--test/CodeGen/AArch64/arm64-vcombine.ll17
-rw-r--r--test/CodeGen/AArch64/arm64-vcvt.ll686
-rw-r--r--test/CodeGen/AArch64/arm64-vcvt_f.ll82
-rw-r--r--test/CodeGen/AArch64/arm64-vcvt_f32_su32.ll73
-rw-r--r--test/CodeGen/AArch64/arm64-vcvt_n.ll49
-rw-r--r--test/CodeGen/AArch64/arm64-vcvt_su32_f32.ll34
-rw-r--r--test/CodeGen/AArch64/arm64-vcvtxd_f32_f64.ll11
-rw-r--r--test/CodeGen/AArch64/arm64-vecCmpBr.ll207
-rw-r--r--test/CodeGen/AArch64/arm64-vecFold.ll145
-rw-r--r--test/CodeGen/AArch64/arm64-vector-ext.ll16
-rw-r--r--test/CodeGen/AArch64/arm64-vector-imm.ll134
-rw-r--r--test/CodeGen/AArch64/arm64-vector-insertion.ll33
-rw-r--r--test/CodeGen/AArch64/arm64-vector-ldst.ll601
-rw-r--r--test/CodeGen/AArch64/arm64-vext.ll464
-rw-r--r--test/CodeGen/AArch64/arm64-vext_reverse.ll172
-rw-r--r--test/CodeGen/AArch64/arm64-vfloatintrinsics.ll375
-rw-r--r--test/CodeGen/AArch64/arm64-vhadd.ll249
-rw-r--r--test/CodeGen/AArch64/arm64-vhsub.ll125
-rw-r--r--test/CodeGen/AArch64/arm64-virtual_base.ll51
-rw-r--r--test/CodeGen/AArch64/arm64-vmax.ll679
-rw-r--r--test/CodeGen/AArch64/arm64-vminmaxnm.ll68
-rw-r--r--test/CodeGen/AArch64/arm64-vmovn.ll242
-rw-r--r--test/CodeGen/AArch64/arm64-vmul.ll2036
-rw-r--r--test/CodeGen/AArch64/arm64-volatile.ll27
-rw-r--r--test/CodeGen/AArch64/arm64-vpopcnt.ll68
-rw-r--r--test/CodeGen/AArch64/arm64-vqadd.ll332
-rw-r--r--test/CodeGen/AArch64/arm64-vqsub.ll147
-rw-r--r--test/CodeGen/AArch64/arm64-vselect.ll25
-rw-r--r--test/CodeGen/AArch64/arm64-vsetcc_fp.ll11
-rw-r--r--test/CodeGen/AArch64/arm64-vshift.ll1917
-rw-r--r--test/CodeGen/AArch64/arm64-vshr.ll63
-rw-r--r--test/CodeGen/AArch64/arm64-vshuffle.ll115
-rw-r--r--test/CodeGen/AArch64/arm64-vsqrt.ll232
-rw-r--r--test/CodeGen/AArch64/arm64-vsra.ll150
-rw-r--r--test/CodeGen/AArch64/arm64-vsub.ll417
-rw-r--r--test/CodeGen/AArch64/arm64-weak-reference.ll10
-rw-r--r--test/CodeGen/AArch64/arm64-xaluo.ll524
-rw-r--r--test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll17
-rw-r--r--test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll49
-rw-r--r--test/CodeGen/AArch64/arm64-zext.ll11
-rw-r--r--test/CodeGen/AArch64/arm64-zextload-unscaled.ll40
-rw-r--r--test/CodeGen/AArch64/arm64-zip.ll107
-rw-r--r--test/CodeGen/AArch64/asm-large-immediate.ll10
-rw-r--r--test/CodeGen/AArch64/assertion-rc-mismatch.ll2
-rw-r--r--test/CodeGen/AArch64/atomic-ops.ll474
-rw-r--r--test/CodeGen/AArch64/basic-pic.ll12
-rw-r--r--test/CodeGen/AArch64/bitfield-insert-0.ll2
-rw-r--r--test/CodeGen/AArch64/bitfield-insert.ll25
-rw-r--r--test/CodeGen/AArch64/bitfield.ll10
-rw-r--r--test/CodeGen/AArch64/blockaddress.ll2
-rw-r--r--test/CodeGen/AArch64/bool-loads.ll22
-rw-r--r--test/CodeGen/AArch64/breg.ll4
-rw-r--r--test/CodeGen/AArch64/callee-save.ll18
-rw-r--r--test/CodeGen/AArch64/code-model-large-abs.ll2
-rw-r--r--test/CodeGen/AArch64/compare-branch.ll2
-rw-r--r--test/CodeGen/AArch64/concatvector-bugs.ll68
-rw-r--r--test/CodeGen/AArch64/cond-sel.ll38
-rw-r--r--test/CodeGen/AArch64/cpus.ll9
-rw-r--r--test/CodeGen/AArch64/directcond.ll24
-rw-r--r--test/CodeGen/AArch64/dp-3source.ll2
-rw-r--r--test/CodeGen/AArch64/dp1.ll2
-rw-r--r--test/CodeGen/AArch64/dp2.ll24
-rw-r--r--test/CodeGen/AArch64/eliminate-trunc.ll39
-rw-r--r--test/CodeGen/AArch64/extern-weak.ll28
-rw-r--r--test/CodeGen/AArch64/extract.ll6
-rw-r--r--test/CodeGen/AArch64/fastcc-reserved.ll16
-rw-r--r--test/CodeGen/AArch64/fastcc.ll47
-rw-r--r--test/CodeGen/AArch64/fcmp.ll2
-rw-r--r--test/CodeGen/AArch64/fcvt-fixed.ll6
-rw-r--r--test/CodeGen/AArch64/fcvt-int.ll2
-rw-r--r--test/CodeGen/AArch64/flags-multiuse.ll4
-rw-r--r--test/CodeGen/AArch64/floatdp_1source.ll2
-rw-r--r--test/CodeGen/AArch64/floatdp_2source.ll2
-rw-r--r--test/CodeGen/AArch64/fp-cond-sel.ll23
-rw-r--r--test/CodeGen/AArch64/fp-dp3.ll4
-rw-r--r--test/CodeGen/AArch64/fp128-folding.ll4
-rw-r--r--test/CodeGen/AArch64/fpimm.ll6
-rw-r--r--test/CodeGen/AArch64/frameaddr.ll4
-rw-r--r--test/CodeGen/AArch64/free-zext.ll14
-rw-r--r--test/CodeGen/AArch64/func-argpassing.ll64
-rw-r--r--test/CodeGen/AArch64/func-calls.ll61
-rw-r--r--test/CodeGen/AArch64/global-alignment.ll26
-rw-r--r--test/CodeGen/AArch64/got-abuse.ll6
-rw-r--r--test/CodeGen/AArch64/i1-contents.ll55
-rw-r--r--test/CodeGen/AArch64/i128-align.ll6
-rw-r--r--test/CodeGen/AArch64/i128-shift.ll43
-rw-r--r--test/CodeGen/AArch64/illegal-float-ops.ll2
-rw-r--r--test/CodeGen/AArch64/init-array.ll4
-rw-r--r--test/CodeGen/AArch64/inline-asm-constraints-badI.ll4
-rw-r--r--test/CodeGen/AArch64/inline-asm-constraints-badK.ll2
-rw-r--r--test/CodeGen/AArch64/inline-asm-constraints-badK2.ll2
-rw-r--r--test/CodeGen/AArch64/inline-asm-constraints-badL.ll2
-rw-r--r--test/CodeGen/AArch64/inline-asm-constraints.ll137
-rw-r--r--test/CodeGen/AArch64/inline-asm-modifiers.ll147
-rw-r--r--test/CodeGen/AArch64/jump-table.ll10
-rw-r--r--test/CodeGen/AArch64/large-consts.ll9
-rw-r--r--test/CodeGen/AArch64/large-frame.ll119
-rw-r--r--test/CodeGen/AArch64/ldst-opt.ll301
-rw-r--r--test/CodeGen/AArch64/ldst-regoffset.ll80
-rw-r--r--test/CodeGen/AArch64/ldst-unscaledimm.ll6
-rw-r--r--test/CodeGen/AArch64/ldst-unsignedimm.ll62
-rw-r--r--test/CodeGen/AArch64/lit.local.cfg7
-rw-r--r--test/CodeGen/AArch64/literal_pools.ll103
-rw-r--r--test/CodeGen/AArch64/literal_pools_float.ll46
-rw-r--r--test/CodeGen/AArch64/local_vars.ll21
-rw-r--r--test/CodeGen/AArch64/logical-imm.ll2
-rw-r--r--test/CodeGen/AArch64/logical_shifted_reg.ll6
-rw-r--r--test/CodeGen/AArch64/mature-mc-support.ll8
-rw-r--r--test/CodeGen/AArch64/movw-consts.ll32
-rw-r--r--test/CodeGen/AArch64/movw-shift-encoding.ll9
-rw-r--r--test/CodeGen/AArch64/mul-lohi.ll4
-rw-r--r--test/CodeGen/AArch64/neon-across.ll472
-rw-r--r--test/CodeGen/AArch64/neon-add-pairwise.ll101
-rw-r--r--test/CodeGen/AArch64/neon-bitwise-instructions.ll519
-rw-r--r--test/CodeGen/AArch64/neon-bsl.ll235
-rw-r--r--test/CodeGen/AArch64/neon-compare-instructions.ll957
-rw-r--r--test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll47
-rw-r--r--test/CodeGen/AArch64/neon-crypto.ll144
-rw-r--r--test/CodeGen/AArch64/neon-diagnostics.ll2
-rw-r--r--test/CodeGen/AArch64/neon-extract.ll106
-rw-r--r--test/CodeGen/AArch64/neon-facge-facgt.ll56
-rw-r--r--test/CodeGen/AArch64/neon-frsqrt-frecp.ll54
-rw-r--r--test/CodeGen/AArch64/neon-halving-add-sub.ll207
-rw-r--r--test/CodeGen/AArch64/neon-idiv.ll13
-rw-r--r--test/CodeGen/AArch64/neon-load-store-v1i32.ll29
-rw-r--r--test/CodeGen/AArch64/neon-max-min-pairwise.ll346
-rw-r--r--test/CodeGen/AArch64/neon-max-min.ll310
-rw-r--r--test/CodeGen/AArch64/neon-misc-scalar.ll60
-rw-r--r--test/CodeGen/AArch64/neon-misc.ll2014
-rw-r--r--test/CodeGen/AArch64/neon-mov.ll127
-rw-r--r--test/CodeGen/AArch64/neon-mul-div.ll754
-rw-r--r--test/CodeGen/AArch64/neon-perm.ll838
-rw-r--r--test/CodeGen/AArch64/neon-rounding-halving-add.ll105
-rw-r--r--test/CodeGen/AArch64/neon-rounding-shift.ll121
-rw-r--r--test/CodeGen/AArch64/neon-saturating-add-sub.ll241
-rw-r--r--test/CodeGen/AArch64/neon-saturating-rounding-shift.ll121
-rw-r--r--test/CodeGen/AArch64/neon-saturating-shift.ll121
-rw-r--r--test/CodeGen/AArch64/neon-scalar-abs.ll61
-rw-r--r--test/CodeGen/AArch64/neon-scalar-add-sub.ll50
-rw-r--r--test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll28
-rw-r--r--test/CodeGen/AArch64/neon-scalar-compare.ll343
-rw-r--r--test/CodeGen/AArch64/neon-scalar-copy.ll64
-rw-r--r--test/CodeGen/AArch64/neon-scalar-cvt.ll133
-rw-r--r--test/CodeGen/AArch64/neon-scalar-ext.ll113
-rw-r--r--test/CodeGen/AArch64/neon-scalar-extract-narrow.ll104
-rw-r--r--test/CodeGen/AArch64/neon-scalar-fabd.ll20
-rw-r--r--test/CodeGen/AArch64/neon-scalar-fcvt.ll233
-rw-r--r--test/CodeGen/AArch64/neon-scalar-fp-compare.ll282
-rw-r--r--test/CodeGen/AArch64/neon-scalar-mul.ll143
-rw-r--r--test/CodeGen/AArch64/neon-scalar-neg.ll61
-rw-r--r--test/CodeGen/AArch64/neon-scalar-recip.ll92
-rw-r--r--test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll215
-rw-r--r--test/CodeGen/AArch64/neon-scalar-rounding-shift.ll39
-rw-r--r--test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll242
-rw-r--r--test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll94
-rw-r--r--test/CodeGen/AArch64/neon-scalar-saturating-shift.ll88
-rw-r--r--test/CodeGen/AArch64/neon-scalar-shift-imm.ll531
-rw-r--r--test/CodeGen/AArch64/neon-scalar-shift.ll236
-rw-r--r--test/CodeGen/AArch64/neon-shift.ll171
-rw-r--r--test/CodeGen/AArch64/neon-shl-ashr-lshr.ll333
-rw-r--r--test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll2314
-rw-r--r--test/CodeGen/AArch64/neon-simd-ldst-one.ll2299
-rw-r--r--test/CodeGen/AArch64/neon-simd-ldst.ll164
-rw-r--r--test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll354
-rw-r--r--test/CodeGen/AArch64/neon-simd-post-ldst-one.ll319
-rw-r--r--test/CodeGen/AArch64/neon-simd-shift.ll1556
-rw-r--r--test/CodeGen/AArch64/neon-simd-tbl.ll828
-rw-r--r--test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll30
-rw-r--r--test/CodeGen/AArch64/neon-truncStore-extLoad.ll8
-rw-r--r--test/CodeGen/AArch64/nzcv-save.ll18
-rw-r--r--test/CodeGen/AArch64/pic-eh-stubs.ll6
-rw-r--r--test/CodeGen/AArch64/ragreedy-csr.ll6
-rw-r--r--test/CodeGen/AArch64/regress-bitcast-formals.ll2
-rw-r--r--test/CodeGen/AArch64/regress-f128csel-flags.ll2
-rw-r--r--test/CodeGen/AArch64/regress-fp128-livein.ll2
-rw-r--r--test/CodeGen/AArch64/regress-tail-livereg.ll2
-rw-r--r--test/CodeGen/AArch64/regress-tblgen-chains.ll13
-rw-r--r--test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll15
-rw-r--r--test/CodeGen/AArch64/regress-wzr-allocatable.ll41
-rw-r--r--test/CodeGen/AArch64/returnaddr.ll2
-rw-r--r--test/CodeGen/AArch64/setcc-takes-i32.ll2
-rw-r--r--test/CodeGen/AArch64/sext_inreg.ll198
-rw-r--r--test/CodeGen/AArch64/sibling-call.ll14
-rw-r--r--test/CodeGen/AArch64/sincos-expansion.ll2
-rw-r--r--test/CodeGen/AArch64/sincospow-vector-expansion.ll2
-rw-r--r--test/CodeGen/AArch64/tail-call.ll34
-rw-r--r--test/CodeGen/AArch64/tst-br.ll12
-rw-r--r--test/CodeGen/AArch64/variadic.ll241
-rw-r--r--test/CodeGen/AArch64/zero-reg.ll9
436 files changed, 50159 insertions, 21338 deletions
diff --git a/test/CodeGen/AArch64/128bit_load_store.ll b/test/CodeGen/AArch64/128bit_load_store.ll
index 502fd70..a6f0776 100644
--- a/test/CodeGen/AArch64/128bit_load_store.ll
+++ b/test/CodeGen/AArch64/128bit_load_store.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s --check-prefix=CHECK
define void @test_store_f128(fp128* %ptr, fp128 %val) #0 {
-; CHECK: test_store_f128
+; CHECK-LABEL: test_store_f128
; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}]
entry:
store fp128 %val, fp128* %ptr, align 16
@@ -9,7 +9,7 @@ entry:
}
define fp128 @test_load_f128(fp128* readonly %ptr) #2 {
-; CHECK: test_load_f128
+; CHECK-LABEL: test_load_f128
; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}]
entry:
%0 = load fp128* %ptr, align 16
@@ -17,9 +17,9 @@ entry:
}
define void @test_vstrq_p128(i128* %ptr, i128 %val) #0 {
-; CHECK: test_vstrq_p128
-; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, #8]
-; CHECK-NEXT: str {{x[0-9]+}}, [{{x[0-9]+}}]
+; CHECK-LABEL: test_vstrq_p128
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}]
+
entry:
%0 = bitcast i128* %ptr to fp128*
%1 = bitcast i128 %val to fp128
@@ -28,9 +28,9 @@ entry:
}
define i128 @test_vldrq_p128(i128* readonly %ptr) #2 {
-; CHECK: test_vldrq_p128
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}]
-; CHECK-NEXT: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #8]
+; CHECK-LABEL: test_vldrq_p128
+; CHECK: ldp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}]
+
entry:
%0 = bitcast i128* %ptr to fp128*
%1 = load fp128* %0, align 16
@@ -39,7 +39,7 @@ entry:
}
define void @test_ld_st_p128(i128* nocapture %ptr) #0 {
-; CHECK: test_ld_st_p128
+; CHECK-LABEL: test_ld_st_p128
; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}]
; CHECK-NEXT: str {{q[0-9]+}}, [{{x[0-9]+}}, #16]
entry:
diff --git a/test/CodeGen/AArch64/neon-v1i1-setcc.ll b/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll
index 6c7d009..c932253 100644
--- a/test/CodeGen/AArch64/neon-v1i1-setcc.ll
+++ b/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; arm64 has a separate copy as aarch64-neon-v1i1-setcc.ll
; This file tests the DAG node like "v1i1 SETCC v1i64, v1i64". As the v1i1 type
; is illegal in the AArch64 backend, the legalizer tries to scalarize this node.
@@ -10,7 +11,7 @@
define i64 @test_sext_extr_cmp_0(<1 x i64> %v1, <1 x i64> %v2) {
; CHECK-LABEL: test_sext_extr_cmp_0:
-; CHECK: cmge d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: cmp {{x[0-9]+}}, {{x[0-9]+}}
%1 = icmp sge <1 x i64> %v1, %v2
%2 = extractelement <1 x i1> %1, i32 0
%vget_lane = sext i1 %2 to i64
@@ -19,7 +20,7 @@ define i64 @test_sext_extr_cmp_0(<1 x i64> %v1, <1 x i64> %v2) {
define i64 @test_sext_extr_cmp_1(<1 x double> %v1, <1 x double> %v2) {
; CHECK-LABEL: test_sext_extr_cmp_1:
-; CHECK: fcmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: fcmp {{d[0-9]+}}, {{d[0-9]+}}
%1 = fcmp oeq <1 x double> %v1, %v2
%2 = extractelement <1 x i1> %1, i32 0
%vget_lane = sext i1 %2 to i64
@@ -29,7 +30,7 @@ define i64 @test_sext_extr_cmp_1(<1 x double> %v1, <1 x double> %v2) {
define <1 x i64> @test_select_v1i1_0(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
; CHECK-LABEL: test_select_v1i1_0:
; CHECK: cmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: bsl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+; CHECK: bic v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
%1 = icmp eq <1 x i64> %v1, %v2
%res = select <1 x i1> %1, <1 x i64> zeroinitializer, <1 x i64> %v3
ret <1 x i64> %res
@@ -38,7 +39,7 @@ define <1 x i64> @test_select_v1i1_0(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3
define <1 x i64> @test_select_v1i1_1(<1 x double> %v1, <1 x double> %v2, <1 x i64> %v3) {
; CHECK-LABEL: test_select_v1i1_1:
; CHECK: fcmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: bsl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+; CHECK: bic v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
%1 = fcmp oeq <1 x double> %v1, %v2
%res = select <1 x i1> %1, <1 x i64> zeroinitializer, <1 x i64> %v3
ret <1 x i64> %res
@@ -47,7 +48,7 @@ define <1 x i64> @test_select_v1i1_1(<1 x double> %v1, <1 x double> %v2, <1 x i6
define <1 x double> @test_select_v1i1_2(<1 x i64> %v1, <1 x i64> %v2, <1 x double> %v3) {
; CHECK-LABEL: test_select_v1i1_2:
; CHECK: cmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: bsl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+; CHECK: bic v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
%1 = icmp eq <1 x i64> %v1, %v2
%res = select <1 x i1> %1, <1 x double> zeroinitializer, <1 x double> %v3
ret <1 x double> %res
diff --git a/test/CodeGen/AArch64/adc.ll b/test/CodeGen/AArch64/adc.ll
index 29637d3..892573b 100644
--- a/test/CodeGen/AArch64/adc.ll
+++ b/test/CodeGen/AArch64/adc.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LE %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-apple-ios7.0 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
define i128 @test_simple(i128 %a, i128 %b, i128 %c) {
; CHECK-LABEL: test_simple:
diff --git a/test/CodeGen/AArch64/addsub-shifted.ll b/test/CodeGen/AArch64/addsub-shifted.ll
index 269c1e8..0a93edd 100644
--- a/test/CodeGen/AArch64/addsub-shifted.ll
+++ b/test/CodeGen/AArch64/addsub-shifted.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs %s -o - -mtriple=arm64-apple-ios7.0 | FileCheck %s
@var32 = global i32 0
@var64 = global i64 0
@@ -35,7 +35,7 @@ define void @test_lsl_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
%shift4a = shl i32 %lhs4a, 15
%val4a = sub i32 0, %shift4a
store volatile i32 %val4a, i32* @var32
-; CHECK: sub {{w[0-9]+}}, wzr, {{w[0-9]+}}, lsl #15
+; CHECK: neg {{w[0-9]+}}, {{w[0-9]+}}, lsl #15
%rhs5 = load volatile i64* @var64
%shift5 = shl i64 %rhs5, 18
@@ -66,7 +66,7 @@ define void @test_lsl_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
%shift8a = shl i64 %lhs8a, 60
%val8a = sub i64 0, %shift8a
store volatile i64 %val8a, i64* @var64
-; CHECK: sub {{x[0-9]+}}, xzr, {{x[0-9]+}}, lsl #60
+; CHECK: neg {{x[0-9]+}}, {{x[0-9]+}}, lsl #60
ret void
; CHECK: ret
@@ -99,7 +99,7 @@ define void @test_lsr_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
%shift4a = lshr i32 %lhs32, 15
%val4a = sub i32 0, %shift4a
store volatile i32 %val4a, i32* @var32
-; CHECK: sub {{w[0-9]+}}, wzr, {{w[0-9]+}}, lsr #15
+; CHECK: neg {{w[0-9]+}}, {{w[0-9]+}}, lsr #15
%shift5 = lshr i64 %rhs64, 18
%val5 = add i64 %lhs64, %shift5
@@ -125,7 +125,7 @@ define void @test_lsr_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
%shift8a = lshr i64 %lhs64, 45
%val8a = sub i64 0, %shift8a
store volatile i64 %val8a, i64* @var64
-; CHECK: sub {{x[0-9]+}}, xzr, {{x[0-9]+}}, lsr #45
+; CHECK: neg {{x[0-9]+}}, {{x[0-9]+}}, lsr #45
ret void
; CHECK: ret
@@ -158,7 +158,7 @@ define void @test_asr_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
%shift4a = ashr i32 %lhs32, 15
%val4a = sub i32 0, %shift4a
store volatile i32 %val4a, i32* @var32
-; CHECK: sub {{w[0-9]+}}, wzr, {{w[0-9]+}}, asr #15
+; CHECK: neg {{w[0-9]+}}, {{w[0-9]+}}, asr #15
%shift5 = ashr i64 %rhs64, 18
%val5 = add i64 %lhs64, %shift5
@@ -184,7 +184,7 @@ define void @test_asr_arith(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
%shift8a = ashr i64 %lhs64, 45
%val8a = sub i64 0, %shift8a
store volatile i64 %val8a, i64* @var64
-; CHECK: sub {{x[0-9]+}}, xzr, {{x[0-9]+}}, asr #45
+; CHECK: neg {{x[0-9]+}}, {{x[0-9]+}}, asr #45
ret void
; CHECK: ret
@@ -245,7 +245,7 @@ define i32 @test_cmn(i32 %lhs32, i32 %rhs32, i64 %lhs64, i64 %rhs64) {
br i1 %tst1, label %t2, label %end
; Important that this isn't lowered to a cmn instruction because if %rhs32 ==
; 0 then the results will differ.
-; CHECK: sub [[RHS:w[0-9]+]], wzr, {{w[0-9]+}}, lsl #13
+; CHECK: neg [[RHS:w[0-9]+]], {{w[0-9]+}}, lsl #13
; CHECK: cmp {{w[0-9]+}}, [[RHS]]
t2:
@@ -268,7 +268,7 @@ t4:
%tst4 = icmp slt i64 %lhs64, %val4
br i1 %tst4, label %t5, label %end
; Again, it's important that cmn isn't used here in case %rhs64 == 0.
-; CHECK: sub [[RHS:x[0-9]+]], xzr, {{x[0-9]+}}, lsl #43
+; CHECK: neg [[RHS:x[0-9]+]], {{x[0-9]+}}, lsl #43
; CHECK: cmp {{x[0-9]+}}, [[RHS]]
t5:
diff --git a/test/CodeGen/AArch64/addsub.ll b/test/CodeGen/AArch64/addsub.ll
index 4d46d04..b85fdbb 100644
--- a/test/CodeGen/AArch64/addsub.ll
+++ b/test/CodeGen/AArch64/addsub.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-linux-gnu | FileCheck %s
; Note that this should be refactored (for efficiency if nothing else)
; when the PCS is implemented so we don't have to worry about the
@@ -28,12 +28,12 @@ define void @add_small() {
define void @add_med() {
; CHECK-LABEL: add_med:
-; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #3567, lsl #12
+; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{#3567, lsl #12|#14610432}}
%val32 = load i32* @var_i32
%newval32 = add i32 %val32, 14610432 ; =0xdef000
store i32 %newval32, i32* @var_i32
-; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #4095, lsl #12
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{#4095, lsl #12|#16773120}}
%val64 = load i64* @var_i64
%newval64 = add i64 %val64, 16773120 ; =0xfff000
store i64 %newval64, i64* @var_i64
@@ -62,12 +62,12 @@ define void @sub_small() {
define void @sub_med() {
; CHECK-LABEL: sub_med:
-; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, #3567, lsl #12
+; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{#3567, lsl #12|#14610432}}
%val32 = load i32* @var_i32
%newval32 = sub i32 %val32, 14610432 ; =0xdef000
store i32 %newval32, i32* @var_i32
-; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, #4095, lsl #12
+; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{#4095, lsl #12|#16773120}}
%val64 = load i64* @var_i64
%newval64 = sub i64 %val64, 16773120 ; =0xfff000
store i64 %newval64, i64* @var_i64
@@ -80,13 +80,13 @@ define void @testing() {
%val = load i32* @var_i32
; CHECK: cmp {{w[0-9]+}}, #4095
-; CHECK: b.ne .LBB4_6
+; CHECK: b.ne [[RET:.?LBB[0-9]+_[0-9]+]]
%cmp_pos_small = icmp ne i32 %val, 4095
br i1 %cmp_pos_small, label %ret, label %test2
test2:
-; CHECK: cmp {{w[0-9]+}}, #3567, lsl #12
-; CHECK: b.lo .LBB4_6
+; CHECK: cmp {{w[0-9]+}}, {{#3567, lsl #12|#14610432}}
+; CHECK: b.lo [[RET]]
%newval2 = add i32 %val, 1
store i32 %newval2, i32* @var_i32
%cmp_pos_big = icmp ult i32 %val, 14610432
@@ -94,7 +94,7 @@ test2:
test3:
; CHECK: cmp {{w[0-9]+}}, #123
-; CHECK: b.lt .LBB4_6
+; CHECK: b.lt [[RET]]
%newval3 = add i32 %val, 2
store i32 %newval3, i32* @var_i32
%cmp_pos_slt = icmp slt i32 %val, 123
@@ -102,7 +102,7 @@ test3:
test4:
; CHECK: cmp {{w[0-9]+}}, #321
-; CHECK: b.gt .LBB4_6
+; CHECK: b.gt [[RET]]
%newval4 = add i32 %val, 3
store i32 %newval4, i32* @var_i32
%cmp_pos_sgt = icmp sgt i32 %val, 321
@@ -110,7 +110,7 @@ test4:
test5:
; CHECK: cmn {{w[0-9]+}}, #444
-; CHECK: b.gt .LBB4_6
+; CHECK: b.gt [[RET]]
%newval5 = add i32 %val, 4
store i32 %newval5, i32* @var_i32
%cmp_neg_uge = icmp sgt i32 %val, -444
diff --git a/test/CodeGen/AArch64/addsub_ext.ll b/test/CodeGen/AArch64/addsub_ext.ll
index f0e11c6..a2266b1 100644
--- a/test/CodeGen/AArch64/addsub_ext.ll
+++ b/test/CodeGen/AArch64/addsub_ext.ll
@@ -1,11 +1,11 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs %s -o - -mtriple=aarch64-linux-gnu | FileCheck %s
@var8 = global i8 0
@var16 = global i16 0
@var32 = global i32 0
@var64 = global i64 0
-define void @addsub_i8rhs() {
+define void @addsub_i8rhs() minsize {
; CHECK-LABEL: addsub_i8rhs:
%val8_tmp = load i8* @var8
%lhs32 = load i32* @var32
@@ -80,7 +80,7 @@ end:
ret void
}
-define void @addsub_i16rhs() {
+define void @addsub_i16rhs() minsize {
; CHECK-LABEL: addsub_i16rhs:
%val16_tmp = load i16* @var16
%lhs32 = load i32* @var32
@@ -158,7 +158,7 @@ end:
; N.b. we could probably check more here ("add w2, w3, w1, uxtw" for
; example), but the remaining instructions are probably not idiomatic
; in the face of "add/sub (shifted register)" so I don't intend to.
-define void @addsub_i32rhs() {
+define void @addsub_i32rhs() minsize {
; CHECK-LABEL: addsub_i32rhs:
%val32_tmp = load i32* @var32
%lhs64 = load i64* @var64
diff --git a/test/CodeGen/AArch64/alloca.ll b/test/CodeGen/AArch64/alloca.ll
index 1d3c0a0..f93efbc 100644
--- a/test/CodeGen/AArch64/alloca.ll
+++ b/test/CodeGen/AArch64/alloca.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP-ARM64 %s
declare void @use_addr(i8*)
@@ -8,23 +8,22 @@ define void @test_simple_alloca(i64 %n) {
%buf = alloca i8, i64 %n
; Make sure we align the stack change to 16 bytes:
-; CHECK-DAG: add [[SPDELTA:x[0-9]+]], x0, #15
-; CHECK-DAG: and x0, [[SPDELTA]], #0xfffffffffffffff0
+; CHECK: {{mov|add}} x29
+; CHECK: mov [[TMP:x[0-9]+]], sp
+; CHECK: add [[SPDELTA_TMP:x[0-9]+]], x0, #15
+; CHECK: and [[SPDELTA:x[0-9]+]], [[SPDELTA_TMP]], #0xfffffffffffffff0
; Make sure we change SP. It would be surprising if anything but x0 were used
; for the final sp, but it could be if it was then moved into x0.
-; CHECK-DAG: mov [[TMP:x[0-9]+]], sp
-; CHECK-DAG: sub x0, [[TMP]], [[SPDELTA]]
-; CHECK: mov sp, x0
+; CHECK: sub [[NEWSP:x[0-9]+]], [[TMP]], [[SPDELTA]]
+; CHECK: mov sp, [[NEWSP]]
call void @use_addr(i8* %buf)
; CHECK: bl use_addr
ret void
; Make sure epilogue restores sp from fp
-; CHECK: sub sp, x29, #16
-; CHECK: ldp x29, x30, [sp, #16]
-; CHECK: add sp, sp, #32
+; CHECK: {{sub|mov}} sp, x29
; CHECK: ret
}
@@ -32,57 +31,70 @@ declare void @use_addr_loc(i8*, i64*)
define i64 @test_alloca_with_local(i64 %n) {
; CHECK-LABEL: test_alloca_with_local:
-; CHECK: sub sp, sp, #32
-; CHECK: stp x29, x30, [sp, #16]
+; CHECK-DAG: sub sp, sp, [[LOCAL_STACK:#[0-9]+]]
+; CHECK-DAG: {{mov|add}} x29, sp
%loc = alloca i64
%buf = alloca i8, i64 %n
; Make sure we align the stack change to 16 bytes:
-; CHECK-DAG: add [[SPDELTA:x[0-9]+]], x0, #15
-; CHECK-DAG: and x0, [[SPDELTA]], #0xfffffffffffffff0
+; CHECK: mov [[TMP:x[0-9]+]], sp
+; CHECK: add [[SPDELTA_TMP:x[0-9]+]], x0, #15
+; CHECK: and [[SPDELTA:x[0-9]+]], [[SPDELTA_TMP]], #0xfffffffffffffff0
; Make sure we change SP. It would be surprising if anything but x0 were used
; for the final sp, but it could be if it was then moved into x0.
-; CHECK-DAG: mov [[TMP:x[0-9]+]], sp
-; CHECK-DAG: sub x0, [[TMP]], [[SPDELTA]]
-; CHECK: mov sp, x0
+; CHECK: sub [[NEWSP:x[0-9]+]], [[TMP]], [[SPDELTA]]
+; CHECK: mov sp, [[NEWSP]]
- ; Obviously suboptimal code here, but it to get &local in x1
-; CHECK: sub [[TMP:x[0-9]+]], x29, [[LOC_FROM_FP:#[0-9]+]]
-; CHECK: add x1, [[TMP]], #0
+; CHECK: sub {{x[0-9]+}}, x29, #[[LOC_FROM_FP:[0-9]+]]
call void @use_addr_loc(i8* %buf, i64* %loc)
; CHECK: bl use_addr
%val = load i64* %loc
-; CHECK: sub x[[TMP:[0-9]+]], x29, [[LOC_FROM_FP]]
-; CHECK: ldr x0, [x[[TMP]]]
+
+; CHECK: ldur x0, [x29, #-[[LOC_FROM_FP]]]
ret i64 %val
; Make sure epilogue restores sp from fp
-; CHECK: sub sp, x29, #16
-; CHECK: ldp x29, x30, [sp, #16]
-; CHECK: add sp, sp, #32
+; CHECK: {{sub|mov}} sp, x29
; CHECK: ret
}
define void @test_variadic_alloca(i64 %n, ...) {
-; CHECK: test_variadic_alloca:
-
-; CHECK: sub sp, sp, #208
-; CHECK: stp x29, x30, [sp, #192]
-; CHECK: add x29, sp, #192
-; CHECK: sub [[TMP:x[0-9]+]], x29, #192
-; CHECK: add x8, [[TMP]], #0
-; CHECK-FP: str q7, [x8, #112]
+; CHECK-LABEL: test_variadic_alloca:
+
; [...]
-; CHECK-FP: str q1, [x8, #16]
-; CHECK-NOFP: sub sp, sp, #80
-; CHECK-NOFP: stp x29, x30, [sp, #64]
-; CHECK-NOFP: add x29, sp, #64
-; CHECK-NOFP: sub [[TMP:x[0-9]+]], x29, #64
-; CHECK-NOFP: add x8, [[TMP]], #0
+
+; CHECK-NOFP-AARCH64: sub sp, sp, #80
+; CHECK-NOFP-AARCH64: stp x29, x30, [sp, #64]
+; CHECK-NOFP-AARCH64: add x29, sp, #64
+; CHECK-NOFP-AARCH64: sub [[TMP:x[0-9]+]], x29, #64
+; CHECK-NOFP-AARCH64: add x8, [[TMP]], #0
+
+
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
+; CHECK: sub sp, sp, #192
+; CHECK: stp q6, q7, [x29, #-96]
+; [...]
+; CHECK: stp q0, q1, [x29, #-192]
+
+; CHECK: stp x6, x7, [x29, #-16]
+; [...]
+; CHECK: stp x2, x3, [x29, #-48]
+
+; CHECK-NOFP-ARM64: stp x29, x30, [sp, #-16]!
+; CHECK-NOFP-ARM64: mov x29, sp
+; CHECK-NOFP-ARM64: sub sp, sp, #64
+; CHECK-NOFP-ARM64: stp x6, x7, [x29, #-16]
+; [...]
+; CHECK-NOFP-ARM64: stp x4, x5, [x29, #-32]
+; [...]
+; CHECK-NOFP-ARM64: stp x2, x3, [x29, #-48]
+; [...]
+; CHECK-NOFP-ARM64: mov x8, sp
%addr = alloca i8, i64 %n
@@ -90,23 +102,24 @@ define void @test_variadic_alloca(i64 %n, ...) {
; CHECK: bl use_addr
ret void
-; CHECK: sub sp, x29, #192
-; CHECK: ldp x29, x30, [sp, #192]
-; CHECK: add sp, sp, #208
-; CHECK-NOFP: sub sp, x29, #64
-; CHECK-NOFP: ldp x29, x30, [sp, #64]
-; CHECK-NOFP: add sp, sp, #80
+; CHECK-NOFP-AARCH64: sub sp, x29, #64
+; CHECK-NOFP-AARCH64: ldp x29, x30, [sp, #64]
+; CHECK-NOFP-AARCH64: add sp, sp, #80
+
+; CHECK-NOFP-ARM64: mov sp, x29
+; CHECK-NOFP-ARM64: ldp x29, x30, [sp], #16
}
define void @test_alloca_large_frame(i64 %n) {
; CHECK-LABEL: test_alloca_large_frame:
-; CHECK: sub sp, sp, #496
-; CHECK: stp x29, x30, [sp, #480]
-; CHECK: add x29, sp, #480
-; CHECK: sub sp, sp, #48
-; CHECK: sub sp, sp, #1953, lsl #12
+
+; CHECK: stp x20, x19, [sp, #-32]!
+; CHECK: stp x29, x30, [sp, #16]
+; CHECK: add x29, sp, #16
+; CHECK: sub sp, sp, #1953, lsl #12
+; CHECK: sub sp, sp, #512
%addr1 = alloca i8, i64 %n
%addr2 = alloca i64, i64 1000000
@@ -114,9 +127,10 @@ define void @test_alloca_large_frame(i64 %n) {
call void @use_addr_loc(i8* %addr1, i64* %addr2)
ret void
-; CHECK: sub sp, x29, #480
-; CHECK: ldp x29, x30, [sp, #480]
-; CHECK: add sp, sp, #496
+
+; CHECK: sub sp, x29, #16
+; CHECK: ldp x29, x30, [sp, #16]
+; CHECK: ldp x20, x19, [sp], #32
}
declare i8* @llvm.stacksave()
@@ -124,7 +138,6 @@ declare void @llvm.stackrestore(i8*)
define void @test_scoped_alloca(i64 %n) {
; CHECK-LABEL: test_scoped_alloca:
-; CHECK: sub sp, sp, #32
%sp = call i8* @llvm.stacksave()
; CHECK: mov [[SAVED_SP:x[0-9]+]], sp
diff --git a/test/CodeGen/AArch64/analyze-branch.ll b/test/CodeGen/AArch64/analyze-branch.ll
index 36bc2e0..6616b27 100644
--- a/test/CodeGen/AArch64/analyze-branch.ll
+++ b/test/CodeGen/AArch64/analyze-branch.ll
@@ -168,7 +168,7 @@ define void @test_TBZ_fallthrough_nottaken(i64 %in) nounwind {
%tst = icmp eq i64 %bit, 0
br i1 %tst, label %true, label %false, !prof !1
-; CHECK: tbz {{x[0-9]+}}, #15, [[TRUE:.LBB[0-9]+_[0-9]+]]
+; CHECK: tbz {{[wx][0-9]+}}, #15, [[TRUE:.LBB[0-9]+_[0-9]+]]
; CHECK-NEXT: // BB#
; CHECK-NEXT: bl test_false
@@ -213,7 +213,7 @@ define void @test_TBNZ_fallthrough_nottaken(i64 %in) nounwind {
%tst = icmp ne i64 %bit, 0
br i1 %tst, label %true, label %false, !prof !1
-; CHECK: tbnz {{x[0-9]+}}, #15, [[TRUE:.LBB[0-9]+_[0-9]+]]
+; CHECK: tbnz {{[wx][0-9]+}}, #15, [[TRUE:.LBB[0-9]+_[0-9]+]]
; CHECK-NEXT: // BB#
; CHECK-NEXT: bl test_false
diff --git a/test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll b/test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll
new file mode 100644
index 0000000..6fb7c3f
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin
+
+; Can't copy or spill / restore CPSR.
+; rdar://9105206
+
+define fastcc void @t() ssp align 2 {
+entry:
+ br i1 undef, label %bb3.i, label %bb2.i
+
+bb2.i: ; preds = %entry
+ br label %bb3.i
+
+bb3.i: ; preds = %bb2.i, %entry
+ br i1 undef, label %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71, label %bb.i69
+
+bb.i69: ; preds = %bb3.i
+ br label %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71
+
+_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71: ; preds = %bb.i69, %bb3.i
+ %0 = select i1 undef, float 0.000000e+00, float undef
+ %1 = fdiv float %0, undef
+ %2 = fcmp ult float %1, 0xBF847AE140000000
+ %storemerge9 = select i1 %2, float %1, float 0.000000e+00
+ store float %storemerge9, float* undef, align 4
+ br i1 undef, label %bb42, label %bb47
+
+bb42: ; preds = %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71
+ br i1 undef, label %bb46, label %bb53
+
+bb46: ; preds = %bb42
+ br label %bb48
+
+bb47: ; preds = %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71
+ br label %bb48
+
+bb48: ; preds = %bb47, %bb46
+ br i1 undef, label %bb1.i14, label %bb.i13
+
+bb.i13: ; preds = %bb48
+ br label %bb1.i14
+
+bb1.i14: ; preds = %bb.i13, %bb48
+ br label %bb53
+
+bb53: ; preds = %bb1.i14, %bb42
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
new file mode 100644
index 0000000..2b083d8
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin
+
+; rdar://9146594
+
+define void @drt_vsprintf() nounwind ssp {
+entry:
+ %do_tab_convert = alloca i32, align 4
+ br i1 undef, label %if.then24, label %if.else295, !dbg !13
+
+if.then24: ; preds = %entry
+ unreachable
+
+if.else295: ; preds = %entry
+ call void @llvm.dbg.declare(metadata !{i32* %do_tab_convert}, metadata !16), !dbg !18
+ store i32 0, i32* %do_tab_convert, align 4, !dbg !19
+ unreachable
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+!llvm.dbg.gv = !{!0}
+!llvm.dbg.sp = !{!1, !7, !10, !11, !12}
+
+!0 = metadata !{i32 589876, i32 0, metadata !1, metadata !"vsplive", metadata !"vsplive", metadata !"", metadata !2, i32 617, metadata !6, i32 1, i32 1, null, null} ; [ DW_TAG_variable ]
+!1 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"drt_vsprintf", metadata !"drt_vsprintf", metadata !"", i32 616, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!2 = metadata !{i32 589865, metadata !20} ; [ DW_TAG_file_type ]
+!3 = metadata !{i32 589841, metadata !20, i32 12, metadata !"clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!4 = metadata !{i32 589845, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !5, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!5 = metadata !{metadata !6}
+!6 = metadata !{i32 589860, null, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!7 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"putc_mem", metadata !"putc_mem", metadata !"", i32 30, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!8 = metadata !{i32 589845, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !9, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!9 = metadata !{null}
+!10 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"print_double", metadata !"print_double", metadata !"", i32 203, metadata !4, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!11 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"print_number", metadata !"print_number", metadata !"", i32 75, metadata !4, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!12 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"get_flags", metadata !"get_flags", metadata !"", i32 508, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!13 = metadata !{i32 653, i32 5, metadata !14, null}
+!14 = metadata !{i32 589835, metadata !20, metadata !15, i32 652, i32 35, i32 2} ; [ DW_TAG_lexical_block ]
+!15 = metadata !{i32 589835, metadata !20, metadata !1, i32 616, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
+!16 = metadata !{i32 590080, metadata !17, metadata !"do_tab_convert", metadata !2, i32 853, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
+!17 = metadata !{i32 589835, metadata !20, metadata !14, i32 850, i32 12, i32 33} ; [ DW_TAG_lexical_block ]
+!18 = metadata !{i32 853, i32 11, metadata !17, null}
+!19 = metadata !{i32 853, i32 29, metadata !17, null}
+!20 = metadata !{metadata !"print.i", metadata !"/Volumes/Ebi/echeng/radars/r9146594"}
+!21 = metadata !{i32 0}
diff --git a/test/CodeGen/AArch64/arm64-2011-03-21-Unaligned-Frame-Index.ll b/test/CodeGen/AArch64/arm64-2011-03-21-Unaligned-Frame-Index.ll
new file mode 100644
index 0000000..6f0ec34
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2011-03-21-Unaligned-Frame-Index.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+define void @foo(i64 %val) {
+; CHECK: foo
+; The stack frame store is not 64-bit aligned. Make sure we use an
+; instruction that can handle that.
+; CHECK: stur x0, [sp, #20]
+ %a = alloca [49 x i32], align 4
+ %p32 = getelementptr inbounds [49 x i32]* %a, i64 0, i64 2
+ %p = bitcast i32* %p32 to i64*
+ store i64 %val, i64* %p, align 8
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll b/test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll
new file mode 100644
index 0000000..88232fc
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=arm64-apple-iOS5.0
+
+; CPSR is not allocatable so fast allocatable wouldn't mark them killed.
+; rdar://9313272
+
+define hidden void @t() nounwind {
+entry:
+ %cmp = icmp eq i32* null, undef
+ %frombool = zext i1 %cmp to i8
+ store i8 %frombool, i8* undef, align 1
+ %tmp4 = load i8* undef, align 1
+ %tobool = trunc i8 %tmp4 to i1
+ br i1 %tobool, label %land.lhs.true, label %if.end
+
+land.lhs.true: ; preds = %entry
+ unreachable
+
+if.end: ; preds = %entry
+ br i1 undef, label %land.lhs.true14, label %if.end33
+
+land.lhs.true14: ; preds = %if.end
+ unreachable
+
+if.end33: ; preds = %if.end
+ unreachable
+}
diff --git a/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll b/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll
new file mode 100644
index 0000000..8f99bc3
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
+
+; Can't fold the increment by 1<<12 into a post-increment load
+; rdar://10301335
+
+@test_data = common global i32 0, align 4
+
+define void @t() nounwind ssp {
+; CHECK-LABEL: t:
+entry:
+ br label %for.body
+
+for.body:
+; CHECK: for.body
+; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}]
+; CHECK: add x[[REG:[0-9]+]],
+; CHECK: x[[REG]], #1, lsl #12
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 12
+ %add = add nsw i64 %0, 34628173824
+ %1 = inttoptr i64 %add to i32*
+ %2 = load volatile i32* %1, align 4096
+ store volatile i32 %2, i32* @test_data, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 200
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll b/test/CodeGen/AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll
new file mode 100644
index 0000000..d47dbb2
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s -march=arm64
+
+; The target lowering for integer comparisons was replacing some DAG nodes
+; during operation legalization, which resulted in dangling pointers,
+; cycles in DAGs, and eventually crashes. This is the testcase for
+; one of those crashes. (rdar://10653656)
+
+define void @test(i1 zeroext %IsArrow) nounwind ssp align 2 {
+entry:
+ br i1 undef, label %return, label %lor.lhs.false
+
+lor.lhs.false:
+ br i1 undef, label %return, label %if.end
+
+if.end:
+ %tmp.i = load i64* undef, align 8
+ %and.i.i.i = and i64 %tmp.i, -16
+ br i1 %IsArrow, label %if.else_crit_edge, label %if.end32
+
+if.else_crit_edge:
+ br i1 undef, label %if.end32, label %return
+
+if.end32:
+ %0 = icmp ult i32 undef, 3
+ %1 = zext i64 %tmp.i to i320
+ %.pn.v = select i1 %0, i320 128, i320 64
+ %.pn = shl i320 %1, %.pn.v
+ %ins346392 = or i320 %.pn, 0
+ store i320 %ins346392, i320* undef, align 8
+ br i1 undef, label %sw.bb.i.i, label %exit
+
+sw.bb.i.i:
+ unreachable
+
+exit:
+ unreachable
+
+return:
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-2012-05-07-DAGCombineVectorExtract.ll b/test/CodeGen/AArch64/arm64-2012-05-07-DAGCombineVectorExtract.ll
new file mode 100644
index 0000000..a4d37e4
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2012-05-07-DAGCombineVectorExtract.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define i32 @foo(<4 x i32> %a, i32 %n) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: fmov w0, s0
+; CHECK-NEXT: ret
+ %b = bitcast <4 x i32> %a to i128
+ %c = trunc i128 %b to i32
+ ret i32 %c
+}
+
+define i64 @bar(<2 x i64> %a, i64 %n) nounwind {
+; CHECK-LABEL: bar:
+; CHECK: fmov x0, d0
+; CHECK-NEXT: ret
+ %b = bitcast <2 x i64> %a to i128
+ %c = trunc i128 %b to i64
+ ret i64 %c
+}
+
diff --git a/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll b/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll
new file mode 100644
index 0000000..d59b0d0
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -march arm64 -mcpu=cyclone | FileCheck %s
+; <rdar://problem/11294426>
+
+@b = private unnamed_addr constant [3 x i32] [i32 1768775988, i32 1685481784, i32 1836253201], align 4
+
+; The important thing for this test is that we need an unaligned load of `l_b'
+; ("ldr w2, [x1, #8]" in this case).
+
+; CHECK: adrp x[[PAGE:[0-9]+]], {{l_b@PAGE|.Lb}}
+; CHECK: add x[[ADDR:[0-9]+]], x[[PAGE]], {{l_b@PAGEOFF|:lo12:.Lb}}
+; CHECK-NEXT: ldr [[VAL:w[0-9]+]], [x[[ADDR]], #8]
+; CHECK-NEXT: str [[VAL]], [x0, #8]
+; CHECK-NEXT: ldr [[VAL2:x[0-9]+]], [x[[ADDR]]]
+; CHECK-NEXT: str [[VAL2]], [x0]
+
+define void @foo(i8* %a) {
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast ([3 x i32]* @b to i8*), i64 12, i32 4, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
diff --git a/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll b/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll
new file mode 100644
index 0000000..d1840d3
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple=arm64-apple-ios < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic < %s | FileCheck %s --check-prefix=CHECK-LINUX
+; <rdar://problem/11392109>
+
+define hidden void @t() optsize ssp {
+entry:
+ store i64 zext (i32 ptrtoint (i64 (i32)* @x to i32) to i64), i64* undef, align 8
+; CHECK: adrp x{{[0-9]+}}, _x@GOTPAGE
+; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}, _x@GOTPAGEOFF]
+; CHECK-NEXT: and x{{[0-9]+}}, x{{[0-9]+}}, #0xffffffff
+; CHECK-NEXT: str x{{[0-9]+}}, [x{{[0-9]+}}]
+ unreachable
+}
+
+declare i64 @x(i32) optsize
+
+; Worth checking the Linux code is sensible too: only way to access
+; the GOT is via a 64-bit load. Just loading wN is unacceptable
+; (there's no ELF relocation to do that).
+
+; CHECK-LINUX: adrp {{x[0-9]+}}, :got:x
+; CHECK-LINUX: ldr {{x[0-9]+}}, [{{x[0-9]+}}, :got_lo12:x]
diff --git a/test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll b/test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll
new file mode 100644
index 0000000..4b037db
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll
@@ -0,0 +1,50 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios -verify-machineinstrs | FileCheck %s
+
+; LdStOpt bug created illegal instruction:
+; %D1<def>, %D2<def> = LDPSi %X0, 1
+; rdar://11512047
+
+%0 = type opaque
+%struct.CGRect = type { %struct.CGPoint, %struct.CGSize }
+%struct.CGPoint = type { double, double }
+%struct.CGSize = type { double, double }
+
+@"OBJC_IVAR_$_UIScreen._bounds" = external hidden global i64, section "__DATA, __objc_ivar", align 8
+
+define hidden %struct.CGRect @t(%0* nocapture %self, i8* nocapture %_cmd) nounwind readonly optsize ssp {
+entry:
+; CHECK-LABEL: t:
+; CHECK: ldp d{{[0-9]+}}, d{{[0-9]+}}
+ %ivar = load i64* @"OBJC_IVAR_$_UIScreen._bounds", align 8, !invariant.load !4
+ %0 = bitcast %0* %self to i8*
+ %add.ptr = getelementptr inbounds i8* %0, i64 %ivar
+ %add.ptr10.0 = bitcast i8* %add.ptr to double*
+ %tmp11 = load double* %add.ptr10.0, align 8
+ %add.ptr.sum = add i64 %ivar, 8
+ %add.ptr10.1 = getelementptr inbounds i8* %0, i64 %add.ptr.sum
+ %1 = bitcast i8* %add.ptr10.1 to double*
+ %tmp12 = load double* %1, align 8
+ %add.ptr.sum17 = add i64 %ivar, 16
+ %add.ptr4.1 = getelementptr inbounds i8* %0, i64 %add.ptr.sum17
+ %add.ptr4.1.0 = bitcast i8* %add.ptr4.1 to double*
+ %tmp = load double* %add.ptr4.1.0, align 8
+ %add.ptr4.1.sum = add i64 %ivar, 24
+ %add.ptr4.1.1 = getelementptr inbounds i8* %0, i64 %add.ptr4.1.sum
+ %2 = bitcast i8* %add.ptr4.1.1 to double*
+ %tmp5 = load double* %2, align 8
+ %insert14 = insertvalue %struct.CGPoint undef, double %tmp11, 0
+ %insert16 = insertvalue %struct.CGPoint %insert14, double %tmp12, 1
+ %insert = insertvalue %struct.CGRect undef, %struct.CGPoint %insert16, 0
+ %insert7 = insertvalue %struct.CGSize undef, double %tmp, 0
+ %insert9 = insertvalue %struct.CGSize %insert7, double %tmp5, 1
+ %insert3 = insertvalue %struct.CGRect %insert, %struct.CGSize %insert9, 1
+ ret %struct.CGRect %insert3
+}
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
+!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
+!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
+!4 = metadata !{}
diff --git a/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
new file mode 100644
index 0000000..168e921
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
@@ -0,0 +1,67 @@
+; RUN: llc -march=arm64 -O0 < %s | FileCheck %s
+; RUN: llc -march=arm64 -O3 < %s | FileCheck %s
+
+@.str = private unnamed_addr constant [9 x i8] c"%lf %lu\0A\00", align 1
+@.str1 = private unnamed_addr constant [8 x i8] c"%lf %u\0A\00", align 1
+@.str2 = private unnamed_addr constant [8 x i8] c"%f %lu\0A\00", align 1
+@.str3 = private unnamed_addr constant [7 x i8] c"%f %u\0A\00", align 1
+
+define void @testDouble(double %d) ssp {
+; CHECK-LABEL: testDouble:
+; CHECK: fcvtzu x{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: fcvtzu w{{[0-9]+}}, d{{[0-9]+}}
+entry:
+ %d.addr = alloca double, align 8
+ store double %d, double* %d.addr, align 8
+ %0 = load double* %d.addr, align 8
+ %1 = load double* %d.addr, align 8
+ %conv = fptoui double %1 to i64
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), double %0, i64 %conv)
+ %2 = load double* %d.addr, align 8
+ %3 = load double* %d.addr, align 8
+ %conv1 = fptoui double %3 to i32
+ %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str1, i32 0, i32 0), double %2, i32 %conv1)
+ ret void
+}
+
+declare i32 @printf(i8*, ...)
+
+define void @testFloat(float %f) ssp {
+; CHECK-LABEL: testFloat:
+; CHECK: fcvtzu x{{[0-9]+}}, s{{[0-9]+}}
+; CHECK: fcvtzu w{{[0-9]+}}, s{{[0-9]+}}
+entry:
+ %f.addr = alloca float, align 4
+ store float %f, float* %f.addr, align 4
+ %0 = load float* %f.addr, align 4
+ %conv = fpext float %0 to double
+ %1 = load float* %f.addr, align 4
+ %conv1 = fptoui float %1 to i64
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str2, i32 0, i32 0), double %conv, i64 %conv1)
+ %2 = load float* %f.addr, align 4
+ %conv2 = fpext float %2 to double
+ %3 = load float* %f.addr, align 4
+ %conv3 = fptoui float %3 to i32
+ %call4 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8]* @.str3, i32 0, i32 0), double %conv2, i32 %conv3)
+ ret void
+}
+
+define i32 @main(i32 %argc, i8** %argv) ssp {
+entry:
+ %retval = alloca i32, align 4
+ %argc.addr = alloca i32, align 4
+ %argv.addr = alloca i8**, align 8
+ store i32 0, i32* %retval
+ store i32 %argc, i32* %argc.addr, align 4
+ store i8** %argv, i8*** %argv.addr, align 8
+ call void @testDouble(double 1.159198e+01)
+ call void @testFloat(float 0x40272F1800000000)
+ ret i32 0
+}
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
+!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
+!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
diff --git a/test/CodeGen/AArch64/arm64-2012-07-11-InstrEmitterBug.ll b/test/CodeGen/AArch64/arm64-2012-07-11-InstrEmitterBug.ll
new file mode 100644
index 0000000..55ecfb5
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2012-07-11-InstrEmitterBug.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios
+; rdar://11849816
+
+@shlib_path_substitutions = external hidden unnamed_addr global i8**, align 8
+
+declare i64 @llvm.objectsize.i64(i8*, i1) nounwind readnone
+
+declare noalias i8* @xmalloc(i64) optsize
+
+declare i64 @strlen(i8* nocapture) nounwind readonly optsize
+
+declare i8* @__strcpy_chk(i8*, i8*, i64) nounwind optsize
+
+declare i8* @__strcat_chk(i8*, i8*, i64) nounwind optsize
+
+declare noalias i8* @xstrdup(i8*) optsize
+
+define i8* @dyld_fix_path(i8* %path) nounwind optsize ssp {
+entry:
+ br i1 undef, label %if.end56, label %for.cond
+
+for.cond: ; preds = %entry
+ br i1 undef, label %for.cond10, label %for.body
+
+for.body: ; preds = %for.cond
+ unreachable
+
+for.cond10: ; preds = %for.cond
+ br i1 undef, label %if.end56, label %for.body14
+
+for.body14: ; preds = %for.cond10
+ %call22 = tail call i64 @strlen(i8* undef) nounwind optsize
+ %sext = shl i64 %call22, 32
+ %conv30 = ashr exact i64 %sext, 32
+ %add29 = sub i64 0, %conv30
+ %sub = add i64 %add29, 0
+ %add31 = shl i64 %sub, 32
+ %sext59 = add i64 %add31, 4294967296
+ %conv33 = ashr exact i64 %sext59, 32
+ %call34 = tail call noalias i8* @xmalloc(i64 %conv33) nounwind optsize
+ br i1 undef, label %cond.false45, label %cond.true43
+
+cond.true43: ; preds = %for.body14
+ unreachable
+
+cond.false45: ; preds = %for.body14
+ %add.ptr = getelementptr inbounds i8* %path, i64 %conv30
+ unreachable
+
+if.end56: ; preds = %for.cond10, %entry
+ ret i8* null
+}
+
+declare i32 @strncmp(i8* nocapture, i8* nocapture, i64) nounwind readonly optsize
+
+declare i8* @strcpy(i8*, i8* nocapture) nounwind
diff --git a/test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll b/test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll
new file mode 100644
index 0000000..e2c43d9
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -fp-contract=fast | FileCheck %s --check-prefix=FAST
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "arm64-apple-ios7.0.0"
+
+;FAST-LABEL: _Z9example25v:
+;FAST: fcmgt.4s
+;FAST: ret
+
+;CHECK-LABEL: _Z9example25v:
+;CHECK: fcmgt.4s
+;CHECK: ret
+
+define <4 x i32> @_Z9example25v( <4 x float> %N0, <4 x float> %N1) {
+ %A = fcmp olt <4 x float> %N0, %N1
+ %B = zext <4 x i1> %A to <4 x i32>
+ ret <4 x i32> %B
+}
diff --git a/test/CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll b/test/CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll
new file mode 100644
index 0000000..9451124
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=arm64
+; Make sure we are not crashing on this test.
+
+define void @autogen_SD13158() {
+entry:
+ %B26 = frem float 0.000000e+00, undef
+ br i1 undef, label %CF, label %CF77
+
+CF: ; preds = %CF, %CF76
+ store float %B26, float* undef
+ br i1 undef, label %CF, label %CF77
+
+CF77: ; preds = %CF
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll b/test/CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll
new file mode 100644
index 0000000..404027b
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -march=arm64
+
+; Make sure we are not crashing on this test.
+
+define void @autogen_SD12881() {
+BB:
+ %B17 = ashr <4 x i32> zeroinitializer, zeroinitializer
+ br label %CF
+
+CF: ; preds = %CF83, %CF, %BB
+ br i1 undef, label %CF, label %CF83
+
+CF83: ; preds = %CF
+ %FC70 = sitofp <4 x i32> %B17 to <4 x double>
+ br label %CF
+}
+
+
+define void @autogen_SD12881_2() {
+BB:
+ %B17 = ashr <4 x i32> zeroinitializer, zeroinitializer
+ br label %CF
+
+CF: ; preds = %CF83, %CF, %BB
+ br i1 undef, label %CF, label %CF83
+
+CF83: ; preds = %CF
+ %FC70 = uitofp <4 x i32> %B17 to <4 x double>
+ br label %CF
+}
+
+define void @_Z12my_example2bv() nounwind noinline ssp {
+entry:
+ %0 = fptosi <2 x double> undef to <2 x i32>
+ store <2 x i32> %0, <2 x i32>* undef, align 8
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll b/test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll
new file mode 100644
index 0000000..a350ba1
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple
+
+;CHECK-LABEL: Shuff:
+;CHECK: tbl.8b
+;CHECK: ret
+define <8 x i8 > @Shuff(<8 x i8> %in, <8 x i8>* %out) nounwind ssp {
+ %value = shufflevector <8 x i8> %in, <8 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i8> %value
+}
+
+
diff --git a/test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll b/test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll
new file mode 100644
index 0000000..a73b707
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=arm64
+
+; This test case exercises an infinite-loop bug in the DAG combiner.
+; It kept performing the following pair of replacements endlessly:
+; (1) Replacing.3 0x2c509f0: v4i32 = any_extend 0x2c4cd08 [ORD=4]
+; With: 0x2c4d128: v4i32 = sign_extend 0x2c4cd08 [ORD=4]
+;
+; (2) Replacing.2 0x2c4d128: v4i32 = sign_extend 0x2c4cd08 [ORD=4]
+; With: 0x2c509f0: v4i32 = any_extend 0x2c4cd08 [ORD=4]
+; As the (2) optimization, from SIGN_EXTEND to ANY_EXTEND, is meant to
+; replace unused bits with undefined bits, we remove the (1) optimization
+; (it doesn't make sense to replace undefined bits with signed bits),
+; which breaks the cycle.
+
+define <4 x i32> @infiniteLoop(<4 x i32> %in0, <4 x i16> %in1) {
+entry:
+ %cmp.i = icmp sge <4 x i16> %in1, <i16 32767, i16 32767, i16 -1, i16 -32768>
+ %sext.i = sext <4 x i1> %cmp.i to <4 x i32>
+ %mul.i = mul <4 x i32> %in0, %sext.i
+ %sext = shl <4 x i32> %mul.i, <i32 16, i32 16, i32 16, i32 16>
+ %vmovl.i.i = ashr <4 x i32> %sext, <i32 16, i32 16, i32 16, i32 16>
+ ret <4 x i32> %vmovl.i.i
+} \ No newline at end of file
diff --git a/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll b/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll
new file mode 100644
index 0000000..3949b85
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -verify-machineinstrs -march=arm64 | FileCheck %s
+
+; Check that sqshl/uqshl with a constant shift amount can be selected.
+define i64 @test_vqshld_s64_i(i64 %a) {
+; CHECK-LABEL: test_vqshld_s64_i:
+; CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, #36
+ %1 = tail call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 36)
+ ret i64 %1
+}
+
+define i64 @test_vqshld_u64_i(i64 %a) {
+; CHECK-LABEL: test_vqshld_u64_i:
+; CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, #36
+ %1 = tail call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 36)
+ ret i64 %1
+}
+
+declare i64 @llvm.aarch64.neon.uqshl.i64(i64, i64)
+declare i64 @llvm.aarch64.neon.sqshl.i64(i64, i64)
diff --git a/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll b/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll
new file mode 100644
index 0000000..1b2d543
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -O0 -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+; The following test cases check shufflevector with a mask that starts with UNDEF elements.
+define <8 x i16> @test_vext_undef_traverse(<8 x i16> %in) {
+;CHECK-LABEL: test_vext_undef_traverse:
+;CHECK: {{ext.16b.*v0, #4}}
+ %vext = shufflevector <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 0, i16 0>, <8 x i16> %in, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9>
+ ret <8 x i16> %vext
+}
+
+define <8 x i16> @test_vext_undef_traverse2(<8 x i16> %in) {
+;CHECK-LABEL: test_vext_undef_traverse2:
+;CHECK: {{ext.16b.*v0, #6}}
+ %vext = shufflevector <8 x i16> %in, <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2>
+ ret <8 x i16> %vext
+}
+
+define <8 x i8> @test_vext_undef_traverse3(<8 x i8> %in) {
+;CHECK-LABEL: test_vext_undef_traverse3:
+;CHECK: {{ext.8b.*v0, #6}}
+ %vext = shufflevector <8 x i8> %in, <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 5>
+ ret <8 x i8> %vext
+}
diff --git a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
new file mode 100644
index 0000000..c4597d5
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
@@ -0,0 +1,67 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s -check-prefix=GENERIC
+
+define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: bar:
+; CHECK: add.2d v[[REG:[0-9]+]], v0, v1
+; CHECK: add d[[REG3:[0-9]+]], d[[REG]], d1
+; CHECK: sub d[[REG2:[0-9]+]], d[[REG]], d1
+; GENERIC-LABEL: bar:
+; GENERIC: add v[[REG:[0-9]+]].2d, v0.2d, v1.2d
+; GENERIC: add d[[REG3:[0-9]+]], d[[REG]], d1
+; GENERIC: sub d[[REG2:[0-9]+]], d[[REG]], d1
+ %add = add <2 x i64> %a, %b
+ %vgetq_lane = extractelement <2 x i64> %add, i32 0
+ %vgetq_lane2 = extractelement <2 x i64> %b, i32 0
+ %add3 = add i64 %vgetq_lane, %vgetq_lane2
+ %sub = sub i64 %vgetq_lane, %vgetq_lane2
+ %vecinit = insertelement <2 x i64> undef, i64 %add3, i32 0
+ %vecinit8 = insertelement <2 x i64> %vecinit, i64 %sub, i32 1
+ ret <2 x i64> %vecinit8
+}
+
+define double @subdd_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: subdd_su64:
+; CHECK: sub d0, d1, d0
+; CHECK-NEXT: ret
+; GENERIC-LABEL: subdd_su64:
+; GENERIC: sub d0, d1, d0
+; GENERIC-NEXT: ret
+ %vecext = extractelement <2 x i64> %a, i32 0
+ %vecext1 = extractelement <2 x i64> %b, i32 0
+ %sub.i = sub nsw i64 %vecext1, %vecext
+ %retval = bitcast i64 %sub.i to double
+ ret double %retval
+}
+
+define double @vaddd_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: vaddd_su64:
+; CHECK: add d0, d1, d0
+; CHECK-NEXT: ret
+; GENERIC-LABEL: vaddd_su64:
+; GENERIC: add d0, d1, d0
+; GENERIC-NEXT: ret
+ %vecext = extractelement <2 x i64> %a, i32 0
+ %vecext1 = extractelement <2 x i64> %b, i32 0
+ %add.i = add nsw i64 %vecext1, %vecext
+ %retval = bitcast i64 %add.i to double
+ ret double %retval
+}
+
+; sub MI doesn't access dsub register.
+define double @add_sub_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: add_sub_su64:
+; CHECK: add d0, d1, d0
+; CHECK: sub d0, {{d[0-9]+}}, d0
+; CHECK-NEXT: ret
+; GENERIC-LABEL: add_sub_su64:
+; GENERIC: add d0, d1, d0
+; GENERIC: sub d0, {{d[0-9]+}}, d0
+; GENERIC-NEXT: ret
+ %vecext = extractelement <2 x i64> %a, i32 0
+ %vecext1 = extractelement <2 x i64> %b, i32 0
+ %add.i = add i64 %vecext1, %vecext
+ %sub.i = sub i64 0, %add.i
+ %retval = bitcast i64 %sub.i to double
+ ret double %retval
+}
diff --git a/test/CodeGen/AArch64/arm64-aapcs.ll b/test/CodeGen/AArch64/arm64-aapcs.ll
new file mode 100644
index 0000000..b713f0d
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-aapcs.ll
@@ -0,0 +1,103 @@
+; RUN: llc -mtriple=arm64-linux-gnu -enable-misched=false < %s | FileCheck %s
+
+@var = global i32 0, align 4
+
+define i128 @test_i128_align(i32, i128 %arg, i32 %after) {
+ store i32 %after, i32* @var, align 4
+; CHECK: str w4, [{{x[0-9]+}}, :lo12:var]
+
+ ret i128 %arg
+; CHECK: mov x0, x2
+; CHECK: mov x1, x3
+}
+
+@var64 = global i64 0, align 8
+
+ ; Check stack slots are 64-bit at all times.
+define void @test_stack_slots([8 x i32], i1 %bool, i8 %char, i16 %short,
+ i32 %int, i64 %long) {
+ ; Part of last store. Blasted scheduler.
+; CHECK: ldr [[LONG:x[0-9]+]], [sp, #32]
+
+ %ext_bool = zext i1 %bool to i64
+ store volatile i64 %ext_bool, i64* @var64, align 8
+; CHECK: ldrb w[[EXT:[0-9]+]], [sp]
+; CHECK: and x[[EXTED:[0-9]+]], x[[EXT]], #0x1
+; CHECK: str x[[EXTED]], [{{x[0-9]+}}, :lo12:var64]
+
+ %ext_char = zext i8 %char to i64
+ store volatile i64 %ext_char, i64* @var64, align 8
+; CHECK: ldrb w[[EXT:[0-9]+]], [sp, #8]
+; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64]
+
+ %ext_short = zext i16 %short to i64
+ store volatile i64 %ext_short, i64* @var64, align 8
+; CHECK: ldrh w[[EXT:[0-9]+]], [sp, #16]
+; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64]
+
+ %ext_int = zext i32 %int to i64
+ store volatile i64 %ext_int, i64* @var64, align 8
+; CHECK: ldr{{b?}} w[[EXT:[0-9]+]], [sp, #24]
+; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64]
+
+ store volatile i64 %long, i64* @var64, align 8
+; CHECK: str [[LONG]], [{{x[0-9]+}}, :lo12:var64]
+
+ ret void
+}
+
+; Make sure the callee does extensions (in the absence of zext/sext
+; keyword on args) while we're here.
+
+define void @test_extension(i1 %bool, i8 %char, i16 %short, i32 %int) {
+ %ext_bool = zext i1 %bool to i64
+ store volatile i64 %ext_bool, i64* @var64
+; CHECK: and [[EXT:x[0-9]+]], x0, #0x1
+; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64]
+
+ %ext_char = sext i8 %char to i64
+ store volatile i64 %ext_char, i64* @var64
+; CHECK: sxtb [[EXT:x[0-9]+]], w1
+; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64]
+
+ %ext_short = zext i16 %short to i64
+ store volatile i64 %ext_short, i64* @var64
+; CHECK: and [[EXT:x[0-9]+]], x2, #0xffff
+; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64]
+
+ %ext_int = zext i32 %int to i64
+ store volatile i64 %ext_int, i64* @var64
+; CHECK: ubfx [[EXT:x[0-9]+]], x3, #0, #32
+; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64]
+
+ ret void
+}
+
+declare void @variadic(i32 %a, ...)
+
+ ; Under AAPCS, variadic functions have the same calling convention as
+ ; others. The extra arguments should go in registers rather than on the stack.
+define void @test_variadic() {
+ call void(i32, ...)* @variadic(i32 0, i64 1, double 2.0)
+; CHECK: fmov d0, #2.0
+; CHECK: orr w1, wzr, #0x1
+; CHECK: bl variadic
+ ret void
+}
+
+; We weren't marking x7 as used after deciding that the i128 didn't fit into
+; registers and putting the first half on the stack, so the *second* half went
+; into x7. Yuck!
+define i128 @test_i128_shadow([7 x i64] %x0_x6, i128 %sp) {
+; CHECK-LABEL: test_i128_shadow:
+; CHECK: ldp x0, x1, [sp]
+
+ ret i128 %sp
+}
+
+; This test checks that fp128 is correctly handled on the stack.
+define fp128 @test_fp128([8 x float] %arg0, fp128 %arg1) {
+; CHECK-LABEL: test_fp128:
+; CHECK: ldr {{q[0-9]+}}, [sp]
+ ret fp128 %arg1
+}
diff --git a/test/CodeGen/AArch64/arm64-abi-varargs.ll b/test/CodeGen/AArch64/arm64-abi-varargs.ll
new file mode 100644
index 0000000..92db392
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-abi-varargs.ll
@@ -0,0 +1,191 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s
+target triple = "arm64-apple-ios7.0.0"
+
+; rdar://13625505
+; Here we have 9 fixed integer arguments; the 9th argument is passed on the
+; stack, and the varargs start right after it at 8-byte alignment.
+define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, ...) nounwind noinline ssp {
+; CHECK-LABEL: fn9:
+; 9th fixed argument
+; CHECK: ldr {{w[0-9]+}}, [sp, #64]
+; CHECK: add [[ARGS:x[0-9]+]], sp, #72
+; CHECK: add {{x[0-9]+}}, [[ARGS]], #8
+; First vararg
+; CHECK: ldr {{w[0-9]+}}, [sp, #72]
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8
+; Second vararg
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}]
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8
+; Third vararg
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}]
+ %1 = alloca i32, align 4
+ %2 = alloca i32, align 4
+ %3 = alloca i32, align 4
+ %4 = alloca i32, align 4
+ %5 = alloca i32, align 4
+ %6 = alloca i32, align 4
+ %7 = alloca i32, align 4
+ %8 = alloca i32, align 4
+ %9 = alloca i32, align 4
+ %args = alloca i8*, align 8
+ %a10 = alloca i32, align 4
+ %a11 = alloca i32, align 4
+ %a12 = alloca i32, align 4
+ store i32 %a1, i32* %1, align 4
+ store i32 %a2, i32* %2, align 4
+ store i32 %a3, i32* %3, align 4
+ store i32 %a4, i32* %4, align 4
+ store i32 %a5, i32* %5, align 4
+ store i32 %a6, i32* %6, align 4
+ store i32 %a7, i32* %7, align 4
+ store i32 %a8, i32* %8, align 4
+ store i32 %a9, i32* %9, align 4
+ %10 = bitcast i8** %args to i8*
+ call void @llvm.va_start(i8* %10)
+ %11 = va_arg i8** %args, i32
+ store i32 %11, i32* %a10, align 4
+ %12 = va_arg i8** %args, i32
+ store i32 %12, i32* %a11, align 4
+ %13 = va_arg i8** %args, i32
+ store i32 %13, i32* %a12, align 4
+ ret void
+}
+
+declare void @llvm.va_start(i8*) nounwind
+
+define i32 @main() nounwind ssp {
+; CHECK-LABEL: main:
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK: str {{x[0-9]+}}, [sp, #8]
+; CHECK: str {{w[0-9]+}}, [sp]
+ %a1 = alloca i32, align 4
+ %a2 = alloca i32, align 4
+ %a3 = alloca i32, align 4
+ %a4 = alloca i32, align 4
+ %a5 = alloca i32, align 4
+ %a6 = alloca i32, align 4
+ %a7 = alloca i32, align 4
+ %a8 = alloca i32, align 4
+ %a9 = alloca i32, align 4
+ %a10 = alloca i32, align 4
+ %a11 = alloca i32, align 4
+ %a12 = alloca i32, align 4
+ store i32 1, i32* %a1, align 4
+ store i32 2, i32* %a2, align 4
+ store i32 3, i32* %a3, align 4
+ store i32 4, i32* %a4, align 4
+ store i32 5, i32* %a5, align 4
+ store i32 6, i32* %a6, align 4
+ store i32 7, i32* %a7, align 4
+ store i32 8, i32* %a8, align 4
+ store i32 9, i32* %a9, align 4
+ store i32 10, i32* %a10, align 4
+ store i32 11, i32* %a11, align 4
+ store i32 12, i32* %a12, align 4
+ %1 = load i32* %a1, align 4
+ %2 = load i32* %a2, align 4
+ %3 = load i32* %a3, align 4
+ %4 = load i32* %a4, align 4
+ %5 = load i32* %a5, align 4
+ %6 = load i32* %a6, align 4
+ %7 = load i32* %a7, align 4
+ %8 = load i32* %a8, align 4
+ %9 = load i32* %a9, align 4
+ %10 = load i32* %a10, align 4
+ %11 = load i32* %a11, align 4
+ %12 = load i32* %a12, align 4
+ call void (i32, i32, i32, i32, i32, i32, i32, i32, i32, ...)* @fn9(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12)
+ ret i32 0
+}
+
+;rdar://13668483
+@.str = private unnamed_addr constant [4 x i8] c"fmt\00", align 1
+define void @foo(i8* %fmt, ...) nounwind {
+entry:
+; CHECK-LABEL: foo:
+; CHECK: orr {{x[0-9]+}}, {{x[0-9]+}}, #0x8
+; CHECK: ldr {{w[0-9]+}}, [sp, #48]
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #15
+; CHECK: and x[[ADDR:[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0
+; CHECK: ldr {{q[0-9]+}}, [x[[ADDR]]]
+ %fmt.addr = alloca i8*, align 8
+ %args = alloca i8*, align 8
+ %vc = alloca i32, align 4
+ %vv = alloca <4 x i32>, align 16
+ store i8* %fmt, i8** %fmt.addr, align 8
+ %args1 = bitcast i8** %args to i8*
+ call void @llvm.va_start(i8* %args1)
+ %0 = va_arg i8** %args, i32
+ store i32 %0, i32* %vc, align 4
+ %1 = va_arg i8** %args, <4 x i32>
+ store <4 x i32> %1, <4 x i32>* %vv, align 16
+ ret void
+}
+
+define void @bar(i32 %x, <4 x i32> %y) nounwind {
+entry:
+; CHECK-LABEL: bar:
+; CHECK: str {{q[0-9]+}}, [sp, #16]
+; CHECK: str {{x[0-9]+}}, [sp]
+ %x.addr = alloca i32, align 4
+ %y.addr = alloca <4 x i32>, align 16
+ store i32 %x, i32* %x.addr, align 4
+ store <4 x i32> %y, <4 x i32>* %y.addr, align 16
+ %0 = load i32* %x.addr, align 4
+ %1 = load <4 x i32>* %y.addr, align 16
+ call void (i8*, ...)* @foo(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %0, <4 x i32> %1)
+ ret void
+}
+
+; rdar://13668927
+; When passing a 16-byte-aligned small struct as a vararg, make sure the
+; caller side keeps it 16-byte aligned on the stack.
+%struct.s41 = type { i32, i16, i32, i16 }
+define void @foo2(i8* %fmt, ...) nounwind {
+entry:
+; CHECK-LABEL: foo2:
+; CHECK: orr {{x[0-9]+}}, {{x[0-9]+}}, #0x8
+; CHECK: ldr {{w[0-9]+}}, [sp, #48]
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #15
+; CHECK: and x[[ADDR:[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0
+; CHECK: ldr {{q[0-9]+}}, [x[[ADDR]]]
+ %fmt.addr = alloca i8*, align 8
+ %args = alloca i8*, align 8
+ %vc = alloca i32, align 4
+ %vs = alloca %struct.s41, align 16
+ store i8* %fmt, i8** %fmt.addr, align 8
+ %args1 = bitcast i8** %args to i8*
+ call void @llvm.va_start(i8* %args1)
+ %0 = va_arg i8** %args, i32
+ store i32 %0, i32* %vc, align 4
+ %ap.cur = load i8** %args
+ %1 = getelementptr i8* %ap.cur, i32 15
+ %2 = ptrtoint i8* %1 to i64
+ %3 = and i64 %2, -16
+ %ap.align = inttoptr i64 %3 to i8*
+ %ap.next = getelementptr i8* %ap.align, i32 16
+ store i8* %ap.next, i8** %args
+ %4 = bitcast i8* %ap.align to %struct.s41*
+ %5 = bitcast %struct.s41* %vs to i8*
+ %6 = bitcast %struct.s41* %4 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* %6, i64 16, i32 16, i1 false)
+ ret void
+}
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+define void @bar2(i32 %x, i128 %s41.coerce) nounwind {
+entry:
+; CHECK-LABEL: bar2:
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK: str {{x[0-9]+}}, [sp]
+ %x.addr = alloca i32, align 4
+ %s41 = alloca %struct.s41, align 16
+ store i32 %x, i32* %x.addr, align 4
+ %0 = bitcast %struct.s41* %s41 to i128*
+ store i128 %s41.coerce, i128* %0, align 1
+ %1 = load i32* %x.addr, align 4
+ %2 = bitcast %struct.s41* %s41 to i128*
+ %3 = load i128* %2, align 1
+ call void (i8*, ...)* @foo2(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %1, i128 %3)
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-abi.ll b/test/CodeGen/AArch64/arm64-abi.ll
new file mode 100644
index 0000000..e2de434
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-abi.ll
@@ -0,0 +1,238 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s
+; RUN: llc < %s -O0 | FileCheck -check-prefix=FAST %s
+target triple = "arm64-apple-darwin"
+
+; rdar://9932559
+define i64 @i8i16callee(i64 %a1, i64 %a2, i64 %a3, i8 signext %a4, i16 signext %a5, i64 %a6, i64 %a7, i64 %a8, i8 signext %b1, i16 signext %b2, i8 signext %b3, i8 signext %b4) nounwind readnone noinline {
+entry:
+; CHECK-LABEL: i8i16callee:
+; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5.
+; They are i8, i16, i8 and i8.
+; CHECK: ldrsb {{w[0-9]+}}, [sp, #5]
+; CHECK: ldrsh {{w[0-9]+}}, [sp, #2]
+; CHECK: ldrsb {{w[0-9]+}}, [sp]
+; CHECK: ldrsb {{w[0-9]+}}, [sp, #4]
+; FAST-LABEL: i8i16callee:
+; FAST: ldrb {{w[0-9]+}}, [sp, #5]
+; FAST: ldrb {{w[0-9]+}}, [sp, #4]
+; FAST: ldrh {{w[0-9]+}}, [sp, #2]
+; FAST: ldrb {{w[0-9]+}}, [sp]
+ %conv = sext i8 %a4 to i64
+ %conv3 = sext i16 %a5 to i64
+ %conv8 = sext i8 %b1 to i64
+ %conv9 = sext i16 %b2 to i64
+ %conv11 = sext i8 %b3 to i64
+ %conv13 = sext i8 %b4 to i64
+ %add10 = add i64 %a2, %a1
+ %add12 = add i64 %add10, %a3
+ %add14 = add i64 %add12, %conv
+ %add = add i64 %add14, %conv3
+ %add1 = add i64 %add, %a6
+ %add2 = add i64 %add1, %a7
+ %add4 = add i64 %add2, %a8
+ %add5 = add i64 %add4, %conv8
+ %add6 = add i64 %add5, %conv9
+ %add7 = add i64 %add6, %conv11
+ %add15 = add i64 %add7, %conv13
+ %sext = shl i64 %add15, 32
+ %conv17 = ashr exact i64 %sext, 32
+ ret i64 %conv17
+}
+
+define i32 @i8i16caller() nounwind readnone {
+entry:
+; CHECK: i8i16caller
+; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5.
+; They are i8, i16, i8 and i8.
+; CHECK: strb {{w[0-9]+}}, [sp, #5]
+; CHECK: strb {{w[0-9]+}}, [sp, #4]
+; CHECK: strh {{w[0-9]+}}, [sp, #2]
+; CHECK: strb {{w[0-9]+}}, [sp]
+; CHECK: bl
+; FAST: i8i16caller
+; FAST: strb {{w[0-9]+}}, [sp]
+; FAST: strh {{w[0-9]+}}, [sp, #2]
+; FAST: strb {{w[0-9]+}}, [sp, #4]
+; FAST: strb {{w[0-9]+}}, [sp, #5]
+; FAST: bl
+ %call = tail call i64 @i8i16callee(i64 0, i64 1, i64 2, i8 signext 3, i16 signext 4, i64 5, i64 6, i64 7, i8 signext 97, i16 signext 98, i8 signext 99, i8 signext 100)
+ %conv = trunc i64 %call to i32
+ ret i32 %conv
+}
+
+; rdar://12651543
+define double @circle_center([2 x float] %a) nounwind ssp {
+ %call = tail call double @ext([2 x float] %a) nounwind
+; CHECK: circle_center
+; CHECK: bl
+ ret double %call
+}
+declare double @ext([2 x float])
+
+; rdar://12656141
+; A 16-byte vector should be 16-byte aligned when passed on the stack.
+; A double argument will be passed on the stack, so the vector should be at sp+16.
+define double @fixed_4i(<4 x i32>* nocapture %in) nounwind {
+entry:
+; CHECK: fixed_4i
+; CHECK: str [[REG_1:q[0-9]+]], [sp, #16]
+; FAST: fixed_4i
+; FAST: sub sp, sp, #64
+; FAST: mov x[[ADDR:[0-9]+]], sp
+; FAST: str [[REG_1:q[0-9]+]], [x[[ADDR]], #16]
+ %0 = load <4 x i32>* %in, align 16
+ %call = tail call double @args_vec_4i(double 3.000000e+00, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, double 3.000000e+00, <4 x i32> %0, i8 signext 3)
+ ret double %call
+}
+declare double @args_vec_4i(double, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, double, <4 x i32>, i8 signext)
+
+; rdar://12695237
+; d8 at sp, i in register w0.
+@g_d = common global double 0.000000e+00, align 8
+define void @test1(float %f1, double %d1, double %d2, double %d3, double %d4,
+ double %d5, double %d6, double %d7, double %d8, i32 %i) nounwind ssp {
+entry:
+; CHECK: test1
+; CHECK: ldr [[REG_1:d[0-9]+]], [sp]
+; CHECK: scvtf [[REG_2:s[0-9]+]], w0
+; CHECK: fadd s0, [[REG_2]], s0
+ %conv = sitofp i32 %i to float
+ %add = fadd float %conv, %f1
+ %conv1 = fpext float %add to double
+ %add2 = fadd double %conv1, %d7
+ %add3 = fadd double %add2, %d8
+ store double %add3, double* @g_d, align 8
+ ret void
+}
+
+; i9 at sp, d1 in register s0.
+define void @test2(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+ i32 %i7, i32 %i8, i32 %i9, float %d1) nounwind ssp {
+entry:
+; CHECK: test2
+; CHECK: scvtf [[REG_2:s[0-9]+]], w0
+; CHECK: fadd s0, [[REG_2]], s0
+; CHECK: ldr [[REG_1:s[0-9]+]], [sp]
+ %conv = sitofp i32 %i1 to float
+ %add = fadd float %conv, %d1
+ %conv1 = fpext float %add to double
+ %conv2 = sitofp i32 %i8 to double
+ %add3 = fadd double %conv2, %conv1
+ %conv4 = sitofp i32 %i9 to double
+ %add5 = fadd double %conv4, %add3
+ store double %add5, double* @g_d, align 8
+ ret void
+}
+
+; rdar://12648441
+; Check alignment on stack for v64, f64, i64, f32, i32.
+define double @test3(<2 x i32>* nocapture %in) nounwind {
+entry:
+; CHECK: test3
+; CHECK: str [[REG_1:d[0-9]+]], [sp, #8]
+; FAST: test3
+; FAST: sub sp, sp, #32
+; FAST: mov x[[ADDR:[0-9]+]], sp
+; FAST: str [[REG_1:d[0-9]+]], [x[[ADDR]], #8]
+ %0 = load <2 x i32>* %in, align 8
+ %call = tail call double @args_vec_2i(double 3.000000e+00, <2 x i32> %0,
+ <2 x i32> %0, <2 x i32> %0, <2 x i32> %0, <2 x i32> %0, <2 x i32> %0,
+ <2 x i32> %0, float 3.000000e+00, <2 x i32> %0, i8 signext 3)
+ ret double %call
+}
+declare double @args_vec_2i(double, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>,
+ <2 x i32>, <2 x i32>, <2 x i32>, float, <2 x i32>, i8 signext)
+
+define double @test4(double* nocapture %in) nounwind {
+entry:
+; CHECK: test4
+; CHECK: str [[REG_1:d[0-9]+]], [sp, #8]
+; CHECK: str [[REG_2:w[0-9]+]], [sp]
+; CHECK: orr w0, wzr, #0x3
+ %0 = load double* %in, align 8
+ %call = tail call double @args_f64(double 3.000000e+00, double %0, double %0,
+ double %0, double %0, double %0, double %0, double %0,
+ float 3.000000e+00, double %0, i8 signext 3)
+ ret double %call
+}
+declare double @args_f64(double, double, double, double, double, double, double,
+ double, float, double, i8 signext)
+
+define i64 @test5(i64* nocapture %in) nounwind {
+entry:
+; CHECK: test5
+; CHECK: strb [[REG_3:w[0-9]+]], [sp, #16]
+; CHECK: str [[REG_1:x[0-9]+]], [sp, #8]
+; CHECK: str [[REG_2:w[0-9]+]], [sp]
+ %0 = load i64* %in, align 8
+ %call = tail call i64 @args_i64(i64 3, i64 %0, i64 %0, i64 %0, i64 %0, i64 %0,
+ i64 %0, i64 %0, i32 3, i64 %0, i8 signext 3)
+ ret i64 %call
+}
+declare i64 @args_i64(i64, i64, i64, i64, i64, i64, i64, i64, i32, i64,
+ i8 signext)
+
+define i32 @test6(float* nocapture %in) nounwind {
+entry:
+; CHECK: test6
+; CHECK: strb [[REG_2:w[0-9]+]], [sp, #8]
+; CHECK: str [[REG_1:s[0-9]+]], [sp, #4]
+; CHECK: strh [[REG_3:w[0-9]+]], [sp]
+ %0 = load float* %in, align 4
+ %call = tail call i32 @args_f32(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6,
+ i32 7, i32 8, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0,
+ float 6.0, float 7.0, float 8.0, i16 signext 3, float %0,
+ i8 signext 3)
+ ret i32 %call
+}
+declare i32 @args_f32(i32, i32, i32, i32, i32, i32, i32, i32,
+ float, float, float, float, float, float, float, float,
+ i16 signext, float, i8 signext)
+
+define i32 @test7(i32* nocapture %in) nounwind {
+entry:
+; CHECK: test7
+; CHECK: strb [[REG_2:w[0-9]+]], [sp, #8]
+; CHECK: str [[REG_1:w[0-9]+]], [sp, #4]
+; CHECK: strh [[REG_3:w[0-9]+]], [sp]
+ %0 = load i32* %in, align 4
+ %call = tail call i32 @args_i32(i32 3, i32 %0, i32 %0, i32 %0, i32 %0, i32 %0,
+ i32 %0, i32 %0, i16 signext 3, i32 %0, i8 signext 4)
+ ret i32 %call
+}
+declare i32 @args_i32(i32, i32, i32, i32, i32, i32, i32, i32, i16 signext, i32,
+ i8 signext)
+
+define i32 @test8(i32 %argc, i8** nocapture %argv) nounwind {
+entry:
+; CHECK: test8
+; CHECK: strb {{w[0-9]+}}, [sp, #3]
+; CHECK: strb wzr, [sp, #2]
+; CHECK: strb {{w[0-9]+}}, [sp, #1]
+; CHECK: strb wzr, [sp]
+; CHECK: bl
+; FAST: test8
+; FAST: strb {{w[0-9]+}}, [sp]
+; FAST: strb {{w[0-9]+}}, [sp, #1]
+; FAST: strb {{w[0-9]+}}, [sp, #2]
+; FAST: strb {{w[0-9]+}}, [sp, #3]
+; FAST: bl
+ tail call void @args_i1(i1 zeroext false, i1 zeroext true, i1 zeroext false,
+ i1 zeroext true, i1 zeroext false, i1 zeroext true,
+ i1 zeroext false, i1 zeroext true, i1 zeroext false,
+ i1 zeroext true, i1 zeroext false, i1 zeroext true)
+ ret i32 0
+}
+
+declare void @args_i1(i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext,
+ i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext,
+ i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext)
+
+define i32 @i1_stack_incoming(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f,
+ i64 %g, i64 %h, i64 %i, i1 zeroext %j) {
+; CHECK-LABEL: i1_stack_incoming:
+; CHECK: ldrb w0, [sp, #8]
+; CHECK: ret
+ %v = zext i1 %j to i32
+ ret i32 %v
+}
diff --git a/test/CodeGen/AArch64/arm64-abi_align.ll b/test/CodeGen/AArch64/arm64-abi_align.ll
new file mode 100644
index 0000000..44c5a07
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-abi_align.ll
@@ -0,0 +1,532 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s
+; RUN: llc < %s -O0 | FileCheck -check-prefix=FAST %s
+target triple = "arm64-apple-darwin"
+
+; rdar://12648441
+; Generated from arm64-arguments.c with -O2.
+; Test passing structs with size < 8, < 16 and > 16
+; with alignment of 16 and without
+
+; Structs with size < 8
+%struct.s38 = type { i32, i16 }
+; With alignment of 16, the size will be padded to a multiple of 16 bytes.
+%struct.s39 = type { i32, i16, [10 x i8] }
+; Structs with size < 16
+%struct.s40 = type { i32, i16, i32, i16 }
+%struct.s41 = type { i32, i16, i32, i16 }
+; Structs with size > 16
+%struct.s42 = type { i32, i16, i32, i16, i32, i16 }
+%struct.s43 = type { i32, i16, i32, i16, i32, i16, [10 x i8] }
+
+@g38 = common global %struct.s38 zeroinitializer, align 4
+@g38_2 = common global %struct.s38 zeroinitializer, align 4
+@g39 = common global %struct.s39 zeroinitializer, align 16
+@g39_2 = common global %struct.s39 zeroinitializer, align 16
+@g40 = common global %struct.s40 zeroinitializer, align 4
+@g40_2 = common global %struct.s40 zeroinitializer, align 4
+@g41 = common global %struct.s41 zeroinitializer, align 16
+@g41_2 = common global %struct.s41 zeroinitializer, align 16
+@g42 = common global %struct.s42 zeroinitializer, align 4
+@g42_2 = common global %struct.s42 zeroinitializer, align 4
+@g43 = common global %struct.s43 zeroinitializer, align 16
+@g43_2 = common global %struct.s43 zeroinitializer, align 16
+
+; structs with size < 8 bytes, passed via i64 in x1 and x2
+define i32 @f38(i32 %i, i64 %s1.coerce, i64 %s2.coerce) #0 {
+entry:
+; CHECK: f38
+; CHECK: add w[[A:[0-9]+]], w1, w0
+; CHECK: add {{w[0-9]+}}, w[[A]], w2
+ %s1.sroa.0.0.extract.trunc = trunc i64 %s1.coerce to i32
+ %s1.sroa.1.4.extract.shift = lshr i64 %s1.coerce, 32
+ %s2.sroa.0.0.extract.trunc = trunc i64 %s2.coerce to i32
+ %s2.sroa.1.4.extract.shift = lshr i64 %s2.coerce, 32
+ %sext8 = shl nuw nsw i64 %s1.sroa.1.4.extract.shift, 16
+ %sext = trunc i64 %sext8 to i32
+ %conv = ashr exact i32 %sext, 16
+ %sext1011 = shl nuw nsw i64 %s2.sroa.1.4.extract.shift, 16
+ %sext10 = trunc i64 %sext1011 to i32
+ %conv6 = ashr exact i32 %sext10, 16
+ %add = add i32 %s1.sroa.0.0.extract.trunc, %i
+ %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc
+ %add4 = add i32 %add3, %conv
+ %add7 = add i32 %add4, %conv6
+ ret i32 %add7
+}
+
+define i32 @caller38() #1 {
+entry:
+; CHECK: caller38
+; CHECK: ldr x1,
+; CHECK: ldr x2,
+ %0 = load i64* bitcast (%struct.s38* @g38 to i64*), align 4
+ %1 = load i64* bitcast (%struct.s38* @g38_2 to i64*), align 4
+ %call = tail call i32 @f38(i32 3, i64 %0, i64 %1) #5
+ ret i32 %call
+}
+
+declare i32 @f38_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+ i32 %i7, i32 %i8, i32 %i9, i64 %s1.coerce, i64 %s2.coerce) #0
+
+; structs with size < 8 bytes, passed on stack at [sp+8] and [sp+16]
+; i9 at [sp]
+define i32 @caller38_stack() #1 {
+entry:
+; CHECK: caller38_stack
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8]
+; CHECK: movz w[[C:[0-9]+]], #0x9
+; CHECK: str w[[C]], [sp]
+ %0 = load i64* bitcast (%struct.s38* @g38 to i64*), align 4
+ %1 = load i64* bitcast (%struct.s38* @g38_2 to i64*), align 4
+ %call = tail call i32 @f38_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6,
+ i32 7, i32 8, i32 9, i64 %0, i64 %1) #5
+ ret i32 %call
+}
+
+; structs with size < 8 bytes, alignment of 16
+; passed via i128 in x1 and x3
+define i32 @f39(i32 %i, i128 %s1.coerce, i128 %s2.coerce) #0 {
+entry:
+; CHECK: f39
+; CHECK: add w[[A:[0-9]+]], w1, w0
+; CHECK: add {{w[0-9]+}}, w[[A]], w3
+ %s1.sroa.0.0.extract.trunc = trunc i128 %s1.coerce to i32
+ %s1.sroa.1.4.extract.shift = lshr i128 %s1.coerce, 32
+ %s2.sroa.0.0.extract.trunc = trunc i128 %s2.coerce to i32
+ %s2.sroa.1.4.extract.shift = lshr i128 %s2.coerce, 32
+ %sext8 = shl nuw nsw i128 %s1.sroa.1.4.extract.shift, 16
+ %sext = trunc i128 %sext8 to i32
+ %conv = ashr exact i32 %sext, 16
+ %sext1011 = shl nuw nsw i128 %s2.sroa.1.4.extract.shift, 16
+ %sext10 = trunc i128 %sext1011 to i32
+ %conv6 = ashr exact i32 %sext10, 16
+ %add = add i32 %s1.sroa.0.0.extract.trunc, %i
+ %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc
+ %add4 = add i32 %add3, %conv
+ %add7 = add i32 %add4, %conv6
+ ret i32 %add7
+}
+
+define i32 @caller39() #1 {
+entry:
+; CHECK: caller39
+; CHECK: ldp x1, x2,
+; CHECK: ldp x3, x4,
+ %0 = load i128* bitcast (%struct.s39* @g39 to i128*), align 16
+ %1 = load i128* bitcast (%struct.s39* @g39_2 to i128*), align 16
+ %call = tail call i32 @f39(i32 3, i128 %0, i128 %1) #5
+ ret i32 %call
+}
+
+declare i32 @f39_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+ i32 %i7, i32 %i8, i32 %i9, i128 %s1.coerce, i128 %s2.coerce) #0
+
+; structs with size < 8 bytes, alignment 16
+; passed on stack at [sp+16] and [sp+32]
+define i32 @caller39_stack() #1 {
+entry:
+; CHECK: caller39_stack
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32]
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK: movz w[[C:[0-9]+]], #0x9
+; CHECK: str w[[C]], [sp]
+ %0 = load i128* bitcast (%struct.s39* @g39 to i128*), align 16
+ %1 = load i128* bitcast (%struct.s39* @g39_2 to i128*), align 16
+ %call = tail call i32 @f39_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6,
+ i32 7, i32 8, i32 9, i128 %0, i128 %1) #5
+ ret i32 %call
+}
+
+; structs with size < 16 bytes
+; passed via i128 in x1 and x3
+define i32 @f40(i32 %i, [2 x i64] %s1.coerce, [2 x i64] %s2.coerce) #0 {
+entry:
+; CHECK: f40
+; CHECK: add w[[A:[0-9]+]], w1, w0
+; CHECK: add {{w[0-9]+}}, w[[A]], w3
+ %s1.coerce.fca.0.extract = extractvalue [2 x i64] %s1.coerce, 0
+ %s2.coerce.fca.0.extract = extractvalue [2 x i64] %s2.coerce, 0
+ %s1.sroa.0.0.extract.trunc = trunc i64 %s1.coerce.fca.0.extract to i32
+ %s2.sroa.0.0.extract.trunc = trunc i64 %s2.coerce.fca.0.extract to i32
+ %s1.sroa.0.4.extract.shift = lshr i64 %s1.coerce.fca.0.extract, 32
+ %sext8 = shl nuw nsw i64 %s1.sroa.0.4.extract.shift, 16
+ %sext = trunc i64 %sext8 to i32
+ %conv = ashr exact i32 %sext, 16
+ %s2.sroa.0.4.extract.shift = lshr i64 %s2.coerce.fca.0.extract, 32
+ %sext1011 = shl nuw nsw i64 %s2.sroa.0.4.extract.shift, 16
+ %sext10 = trunc i64 %sext1011 to i32
+ %conv6 = ashr exact i32 %sext10, 16
+ %add = add i32 %s1.sroa.0.0.extract.trunc, %i
+ %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc
+ %add4 = add i32 %add3, %conv
+ %add7 = add i32 %add4, %conv6
+ ret i32 %add7
+}
+
+define i32 @caller40() #1 {
+entry:
+; CHECK: caller40
+; CHECK: ldp x1, x2,
+; CHECK: ldp x3, x4,
+ %0 = load [2 x i64]* bitcast (%struct.s40* @g40 to [2 x i64]*), align 4
+ %1 = load [2 x i64]* bitcast (%struct.s40* @g40_2 to [2 x i64]*), align 4
+ %call = tail call i32 @f40(i32 3, [2 x i64] %0, [2 x i64] %1) #5
+ ret i32 %call
+}
+
+declare i32 @f40_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+ i32 %i7, i32 %i8, i32 %i9, [2 x i64] %s1.coerce, [2 x i64] %s2.coerce) #0
+
+; structs with size < 16 bytes
+; passed on stack at [sp+8] and [sp+24]
+define i32 @caller40_stack() #1 {
+entry:
+; CHECK: caller40_stack
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #24]
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8]
+; CHECK: movz w[[C:[0-9]+]], #0x9
+; CHECK: str w[[C]], [sp]
+ %0 = load [2 x i64]* bitcast (%struct.s40* @g40 to [2 x i64]*), align 4
+ %1 = load [2 x i64]* bitcast (%struct.s40* @g40_2 to [2 x i64]*), align 4
+ %call = tail call i32 @f40_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6,
+ i32 7, i32 8, i32 9, [2 x i64] %0, [2 x i64] %1) #5
+ ret i32 %call
+}
+
+; structs with size < 16 bytes, alignment of 16
+; passed via i128 in x1 and x3
+define i32 @f41(i32 %i, i128 %s1.coerce, i128 %s2.coerce) #0 {
+entry:
+; CHECK: f41
+; CHECK: add w[[A:[0-9]+]], w1, w0
+; CHECK: add {{w[0-9]+}}, w[[A]], w3
+ %s1.sroa.0.0.extract.trunc = trunc i128 %s1.coerce to i32
+ %s1.sroa.1.4.extract.shift = lshr i128 %s1.coerce, 32
+ %s2.sroa.0.0.extract.trunc = trunc i128 %s2.coerce to i32
+ %s2.sroa.1.4.extract.shift = lshr i128 %s2.coerce, 32
+ %sext8 = shl nuw nsw i128 %s1.sroa.1.4.extract.shift, 16
+ %sext = trunc i128 %sext8 to i32
+ %conv = ashr exact i32 %sext, 16
+ %sext1011 = shl nuw nsw i128 %s2.sroa.1.4.extract.shift, 16
+ %sext10 = trunc i128 %sext1011 to i32
+ %conv6 = ashr exact i32 %sext10, 16
+ %add = add i32 %s1.sroa.0.0.extract.trunc, %i
+ %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc
+ %add4 = add i32 %add3, %conv
+ %add7 = add i32 %add4, %conv6
+ ret i32 %add7
+}
+
+define i32 @caller41() #1 {
+entry:
+; CHECK: caller41
+; CHECK: ldp x1, x2,
+; CHECK: ldp x3, x4,
+ %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16
+ %1 = load i128* bitcast (%struct.s41* @g41_2 to i128*), align 16
+ %call = tail call i32 @f41(i32 3, i128 %0, i128 %1) #5
+ ret i32 %call
+}
+
+declare i32 @f41_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+ i32 %i7, i32 %i8, i32 %i9, i128 %s1.coerce, i128 %s2.coerce) #0
+
+; structs with size < 16 bytes, alignment of 16
+; passed on stack at [sp+16] and [sp+32]
+define i32 @caller41_stack() #1 {
+entry:
+; CHECK: caller41_stack
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32]
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK: movz w[[C:[0-9]+]], #0x9
+; CHECK: str w[[C]], [sp]
+ %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16
+ %1 = load i128* bitcast (%struct.s41* @g41_2 to i128*), align 16
+ %call = tail call i32 @f41_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6,
+ i32 7, i32 8, i32 9, i128 %0, i128 %1) #5
+ ret i32 %call
+}
+
+; structs with size of 22 bytes, passed indirectly in x1 and x2
+define i32 @f42(i32 %i, %struct.s42* nocapture %s1, %struct.s42* nocapture %s2) #2 {
+entry:
+; CHECK: f42
+; CHECK: ldr w[[A:[0-9]+]], [x1]
+; CHECK: ldr w[[B:[0-9]+]], [x2]
+; CHECK: add w[[C:[0-9]+]], w[[A]], w0
+; CHECK: add {{w[0-9]+}}, w[[C]], w[[B]]
+; FAST: f42
+; FAST: ldr w[[A:[0-9]+]], [x1]
+; FAST: ldr w[[B:[0-9]+]], [x2]
+; FAST: add w[[C:[0-9]+]], w[[A]], w0
+; FAST: add {{w[0-9]+}}, w[[C]], w[[B]]
+ %i1 = getelementptr inbounds %struct.s42* %s1, i64 0, i32 0
+ %0 = load i32* %i1, align 4, !tbaa !0
+ %i2 = getelementptr inbounds %struct.s42* %s2, i64 0, i32 0
+ %1 = load i32* %i2, align 4, !tbaa !0
+ %s = getelementptr inbounds %struct.s42* %s1, i64 0, i32 1
+ %2 = load i16* %s, align 2, !tbaa !3
+ %conv = sext i16 %2 to i32
+ %s5 = getelementptr inbounds %struct.s42* %s2, i64 0, i32 1
+ %3 = load i16* %s5, align 2, !tbaa !3
+ %conv6 = sext i16 %3 to i32
+ %add = add i32 %0, %i
+ %add3 = add i32 %add, %1
+ %add4 = add i32 %add3, %conv
+ %add7 = add i32 %add4, %conv6
+ ret i32 %add7
+}
+
+; For s1, we allocate a 22-byte space and pass its address via x1
+define i32 @caller42() #3 {
+entry:
+; CHECK: caller42
+; CHECK: str {{x[0-9]+}}, [sp, #48]
+; CHECK: str {{q[0-9]+}}, [sp, #32]
+; CHECK: str {{x[0-9]+}}, [sp, #16]
+; CHECK: str {{q[0-9]+}}, [sp]
+; CHECK: add x1, sp, #32
+; CHECK: mov x2, sp
+; Space for s1 is allocated at sp+32
+; Space for s2 is allocated at sp
+
+; FAST: caller42
+; FAST: sub sp, sp, #96
+; Space for s1 is allocated at fp-24 = sp+72
+; Space for s2 is allocated at sp+48
+; FAST: sub x[[A:[0-9]+]], x29, #24
+; FAST: add x[[A:[0-9]+]], sp, #48
+; Call memcpy with size = 24 (0x18)
+; FAST: orr {{x[0-9]+}}, xzr, #0x18
+ %tmp = alloca %struct.s42, align 4
+ %tmp1 = alloca %struct.s42, align 4
+ %0 = bitcast %struct.s42* %tmp to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s42* @g42 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4
+ %1 = bitcast %struct.s42* %tmp1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s42* @g42_2 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4
+ %call = call i32 @f42(i32 3, %struct.s42* %tmp, %struct.s42* %tmp1) #5
+ ret i32 %call
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) #4
+
+declare i32 @f42_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+ i32 %i7, i32 %i8, i32 %i9, %struct.s42* nocapture %s1,
+ %struct.s42* nocapture %s2) #2
+
+define i32 @caller42_stack() #3 {
+entry:
+; CHECK: caller42_stack
+; CHECK: mov x29, sp
+; CHECK: sub sp, sp, #96
+; CHECK: stur {{x[0-9]+}}, [x29, #-16]
+; CHECK: stur {{q[0-9]+}}, [x29, #-32]
+; CHECK: str {{x[0-9]+}}, [sp, #48]
+; CHECK: str {{q[0-9]+}}, [sp, #32]
+; Space for s1 is allocated at x29-32 = sp+64
+; Space for s2 is allocated at sp+32
+; CHECK: add x[[B:[0-9]+]], sp, #32
+; CHECK: str x[[B]], [sp, #16]
+; CHECK: sub x[[A:[0-9]+]], x29, #32
+; Address of s1 is passed on stack at sp+8
+; CHECK: str x[[A]], [sp, #8]
+; CHECK: movz w[[C:[0-9]+]], #0x9
+; CHECK: str w[[C]], [sp]
+
+; FAST: caller42_stack
+; Space for s1 is allocated at fp-24
+; Space for s2 is allocated at fp-48
+; FAST: sub x[[A:[0-9]+]], x29, #24
+; FAST: sub x[[B:[0-9]+]], x29, #48
+; Call memcpy with size = 24 (0x18)
+; FAST: orr {{x[0-9]+}}, xzr, #0x18
+; FAST: str {{w[0-9]+}}, [sp]
+; Address of s1 is passed on stack at sp+8
+; FAST: str {{x[0-9]+}}, [sp, #8]
+; FAST: str {{x[0-9]+}}, [sp, #16]
+ %tmp = alloca %struct.s42, align 4
+ %tmp1 = alloca %struct.s42, align 4
+ %0 = bitcast %struct.s42* %tmp to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s42* @g42 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4
+ %1 = bitcast %struct.s42* %tmp1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s42* @g42_2 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4
+ %call = call i32 @f42_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+ i32 8, i32 9, %struct.s42* %tmp, %struct.s42* %tmp1) #5
+ ret i32 %call
+}
+
+; structs with size of 22 bytes, alignment of 16
+; passed indirectly in x1 and x2
+define i32 @f43(i32 %i, %struct.s43* nocapture %s1, %struct.s43* nocapture %s2) #2 {
+entry:
+; CHECK: f43
+; CHECK: ldr w[[A:[0-9]+]], [x1]
+; CHECK: ldr w[[B:[0-9]+]], [x2]
+; CHECK: add w[[C:[0-9]+]], w[[A]], w0
+; CHECK: add {{w[0-9]+}}, w[[C]], w[[B]]
+; FAST: f43
+; FAST: ldr w[[A:[0-9]+]], [x1]
+; FAST: ldr w[[B:[0-9]+]], [x2]
+; FAST: add w[[C:[0-9]+]], w[[A]], w0
+; FAST: add {{w[0-9]+}}, w[[C]], w[[B]]
+ %i1 = getelementptr inbounds %struct.s43* %s1, i64 0, i32 0
+ %0 = load i32* %i1, align 4, !tbaa !0
+ %i2 = getelementptr inbounds %struct.s43* %s2, i64 0, i32 0
+ %1 = load i32* %i2, align 4, !tbaa !0
+ %s = getelementptr inbounds %struct.s43* %s1, i64 0, i32 1
+ %2 = load i16* %s, align 2, !tbaa !3
+ %conv = sext i16 %2 to i32
+ %s5 = getelementptr inbounds %struct.s43* %s2, i64 0, i32 1
+ %3 = load i16* %s5, align 2, !tbaa !3
+ %conv6 = sext i16 %3 to i32
+ %add = add i32 %0, %i
+ %add3 = add i32 %add, %1
+ %add4 = add i32 %add3, %conv
+ %add7 = add i32 %add4, %conv6
+ ret i32 %add7
+}
+
+define i32 @caller43() #3 {
+entry:
+; CHECK: caller43
+; CHECK: str {{q[0-9]+}}, [sp, #48]
+; CHECK: str {{q[0-9]+}}, [sp, #32]
+; CHECK: str {{q[0-9]+}}, [sp, #16]
+; CHECK: str {{q[0-9]+}}, [sp]
+; CHECK: add x1, sp, #32
+; CHECK: mov x2, sp
+; Space for s1 is allocated at sp+32
+; Space for s2 is allocated at sp
+
+; FAST: caller43
+; FAST: mov x29, sp
+; Space for s1 is allocated at sp+32
+; Space for s2 is allocated at sp
+; FAST: add x1, sp, #32
+; FAST: mov x2, sp
+; FAST: str {{x[0-9]+}}, [sp, #32]
+; FAST: str {{x[0-9]+}}, [sp, #40]
+; FAST: str {{x[0-9]+}}, [sp, #48]
+; FAST: str {{x[0-9]+}}, [sp, #56]
+; FAST: str {{x[0-9]+}}, [sp]
+; FAST: str {{x[0-9]+}}, [sp, #8]
+; FAST: str {{x[0-9]+}}, [sp, #16]
+; FAST: str {{x[0-9]+}}, [sp, #24]
+ %tmp = alloca %struct.s43, align 16
+ %tmp1 = alloca %struct.s43, align 16
+ %0 = bitcast %struct.s43* %tmp to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s43* @g43 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4
+ %1 = bitcast %struct.s43* %tmp1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s43* @g43_2 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4
+ %call = call i32 @f43(i32 3, %struct.s43* %tmp, %struct.s43* %tmp1) #5
+ ret i32 %call
+}
+
+declare i32 @f43_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6,
+ i32 %i7, i32 %i8, i32 %i9, %struct.s43* nocapture %s1,
+ %struct.s43* nocapture %s2) #2
+
+define i32 @caller43_stack() #3 {
+entry:
+; CHECK: caller43_stack
+; CHECK: mov x29, sp
+; CHECK: sub sp, sp, #96
+; CHECK: stur {{q[0-9]+}}, [x29, #-16]
+; CHECK: stur {{q[0-9]+}}, [x29, #-32]
+; CHECK: str {{q[0-9]+}}, [sp, #48]
+; CHECK: str {{q[0-9]+}}, [sp, #32]
+; Space for s1 is allocated at x29-32 = sp+64
+; Space for s2 is allocated at sp+32
+; CHECK: add x[[B:[0-9]+]], sp, #32
+; CHECK: str x[[B]], [sp, #16]
+; CHECK: sub x[[A:[0-9]+]], x29, #32
+; Address of s1 is passed on stack at sp+8
+; CHECK: str x[[A]], [sp, #8]
+; CHECK: movz w[[C:[0-9]+]], #0x9
+; CHECK: str w[[C]], [sp]
+
+; FAST: caller43_stack
+; FAST: sub sp, sp, #96
+; Space for s1 is allocated at fp-32 = sp+64
+; Space for s2 is allocated at sp+32
+; FAST: sub x[[A:[0-9]+]], x29, #32
+; FAST: add x[[B:[0-9]+]], sp, #32
+; FAST: stur {{x[0-9]+}}, [x29, #-32]
+; FAST: stur {{x[0-9]+}}, [x29, #-24]
+; FAST: stur {{x[0-9]+}}, [x29, #-16]
+; FAST: stur {{x[0-9]+}}, [x29, #-8]
+; FAST: str {{x[0-9]+}}, [sp, #32]
+; FAST: str {{x[0-9]+}}, [sp, #40]
+; FAST: str {{x[0-9]+}}, [sp, #48]
+; FAST: str {{x[0-9]+}}, [sp, #56]
+; FAST: str {{w[0-9]+}}, [sp]
+; Address of s1 is passed on stack at sp+8
+; FAST: str {{x[0-9]+}}, [sp, #8]
+; FAST: str {{x[0-9]+}}, [sp, #16]
+ %tmp = alloca %struct.s43, align 16
+ %tmp1 = alloca %struct.s43, align 16
+ %0 = bitcast %struct.s43* %tmp to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s43* @g43 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4
+ %1 = bitcast %struct.s43* %tmp1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s43* @g43_2 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4
+ %call = call i32 @f43_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+ i32 8, i32 9, %struct.s43* %tmp, %struct.s43* %tmp1) #5
+ ret i32 %call
+}
+
+; rdar://13668927
+; Check that we don't split an i128.
+declare i32 @callee_i128_split(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5,
+ i32 %i6, i32 %i7, i128 %s1, i32 %i8)
+
+define i32 @i128_split() {
+entry:
+; CHECK: i128_split
+; "i128 %0" should be on stack at [sp].
+; "i32 8" should be on stack at [sp, #16].
+; CHECK: str {{w[0-9]+}}, [sp, #16]
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp]
+; FAST: i128_split
+; FAST: sub sp, sp, #48
+; FAST: mov x[[ADDR:[0-9]+]], sp
+; FAST: str {{w[0-9]+}}, [x[[ADDR]], #16]
+; Load/Store opt is disabled with -O0, so the i128 is split.
+; FAST: str {{x[0-9]+}}, [x[[ADDR]], #8]
+; FAST: str {{x[0-9]+}}, [x[[ADDR]]]
+ %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16
+ %call = tail call i32 @callee_i128_split(i32 1, i32 2, i32 3, i32 4, i32 5,
+ i32 6, i32 7, i128 %0, i32 8) #5
+ ret i32 %call
+}
+
+declare i32 @callee_i64(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5,
+ i32 %i6, i32 %i7, i64 %s1, i32 %i8)
+
+define i32 @i64_split() {
+entry:
+; CHECK: i64_split
+; "i64 %0" should be in register x7.
+; "i32 8" should be on stack at [sp].
+; CHECK: ldr x7, [{{x[0-9]+}}]
+; CHECK: str {{w[0-9]+}}, [sp]
+; FAST: i64_split
+; FAST: ldr x7, [{{x[0-9]+}}]
+; FAST: str {{w[0-9]+}}, [sp]
+ %0 = load i64* bitcast (%struct.s41* @g41 to i64*), align 16
+ %call = tail call i32 @callee_i64(i32 1, i32 2, i32 3, i32 4, i32 5,
+ i32 6, i32 7, i64 %0, i32 8) #5
+ ret i32 %call
+}
+
+attributes #0 = { noinline nounwind readnone "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" }
+attributes #1 = { nounwind readonly "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" }
+attributes #2 = { noinline nounwind readonly "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" }
+attributes #3 = { nounwind "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" }
+attributes #4 = { nounwind }
+attributes #5 = { nobuiltin }
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"short", metadata !1}
+!4 = metadata !{i64 0, i64 4, metadata !0, i64 4, i64 2, metadata !3, i64 8, i64 4, metadata !0, i64 12, i64 2, metadata !3, i64 16, i64 4, metadata !0, i64 20, i64 2, metadata !3}
diff --git a/test/CodeGen/AArch64/arm64-addp.ll b/test/CodeGen/AArch64/arm64-addp.ll
new file mode 100644
index 0000000..3f1e5c5
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-addp.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -mcpu=cyclone | FileCheck %s
+
+define double @foo(<2 x double> %a) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: faddp.2d d0, v0
+; CHECK-NEXT: ret
+ %lane0.i = extractelement <2 x double> %a, i32 0
+ %lane1.i = extractelement <2 x double> %a, i32 1
+ %vpaddd.i = fadd double %lane0.i, %lane1.i
+ ret double %vpaddd.i
+}
+
+define i64 @foo0(<2 x i64> %a) nounwind {
+; CHECK-LABEL: foo0:
+; CHECK: addp.2d d0, v0
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+ %lane0.i = extractelement <2 x i64> %a, i32 0
+ %lane1.i = extractelement <2 x i64> %a, i32 1
+ %vpaddd.i = add i64 %lane0.i, %lane1.i
+ ret i64 %vpaddd.i
+}
+
+define float @foo1(<2 x float> %a) nounwind {
+; CHECK-LABEL: foo1:
+; CHECK: faddp.2s
+; CHECK-NEXT: ret
+ %lane0.i = extractelement <2 x float> %a, i32 0
+ %lane1.i = extractelement <2 x float> %a, i32 1
+ %vpaddd.i = fadd float %lane0.i, %lane1.i
+ ret float %vpaddd.i
+}
diff --git a/test/CodeGen/AArch64/arm64-addr-mode-folding.ll b/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
new file mode 100644
index 0000000..08fb8c9
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
@@ -0,0 +1,171 @@
+; RUN: llc -O3 -mtriple arm64-apple-ios3 %s -o - | FileCheck %s
+; <rdar://problem/13621857>
+
+@block = common global i8* null, align 8
+
+define i32 @fct(i32 %i1, i32 %i2) {
+; CHECK: @fct
+; The sign extension is used more than once, so it should not be folded into
+; the addressing mode. However, CodeGenPrepare does not currently share the
+; sext across uses, so it ends up being folded anyway; the check is disabled.
+; _CHECK-NOT_: , sxtw]
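+; (With folding, the access would look roughly like: ldrb w8, [x9, w0, sxtw].)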
+entry:
+ %idxprom = sext i32 %i1 to i64
+ %0 = load i8** @block, align 8
+ %arrayidx = getelementptr inbounds i8* %0, i64 %idxprom
+ %1 = load i8* %arrayidx, align 1
+ %idxprom1 = sext i32 %i2 to i64
+ %arrayidx2 = getelementptr inbounds i8* %0, i64 %idxprom1
+ %2 = load i8* %arrayidx2, align 1
+ %cmp = icmp eq i8 %1, %2
+ br i1 %cmp, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %cmp7 = icmp ugt i8 %1, %2
+ %conv8 = zext i1 %cmp7 to i32
+ br label %return
+
+if.end: ; preds = %entry
+ %inc = add nsw i32 %i1, 1
+ %inc9 = add nsw i32 %i2, 1
+ %idxprom10 = sext i32 %inc to i64
+ %arrayidx11 = getelementptr inbounds i8* %0, i64 %idxprom10
+ %3 = load i8* %arrayidx11, align 1
+ %idxprom12 = sext i32 %inc9 to i64
+ %arrayidx13 = getelementptr inbounds i8* %0, i64 %idxprom12
+ %4 = load i8* %arrayidx13, align 1
+ %cmp16 = icmp eq i8 %3, %4
+ br i1 %cmp16, label %if.end23, label %if.then18
+
+if.then18: ; preds = %if.end
+ %cmp21 = icmp ugt i8 %3, %4
+ %conv22 = zext i1 %cmp21 to i32
+ br label %return
+
+if.end23: ; preds = %if.end
+ %inc24 = add nsw i32 %i1, 2
+ %inc25 = add nsw i32 %i2, 2
+ %idxprom26 = sext i32 %inc24 to i64
+ %arrayidx27 = getelementptr inbounds i8* %0, i64 %idxprom26
+ %5 = load i8* %arrayidx27, align 1
+ %idxprom28 = sext i32 %inc25 to i64
+ %arrayidx29 = getelementptr inbounds i8* %0, i64 %idxprom28
+ %6 = load i8* %arrayidx29, align 1
+ %cmp32 = icmp eq i8 %5, %6
+ br i1 %cmp32, label %return, label %if.then34
+
+if.then34: ; preds = %if.end23
+ %cmp37 = icmp ugt i8 %5, %6
+ %conv38 = zext i1 %cmp37 to i32
+ br label %return
+
+return: ; preds = %if.end23, %if.then34, %if.then18, %if.then
+ %retval.0 = phi i32 [ %conv8, %if.then ], [ %conv22, %if.then18 ], [ %conv38, %if.then34 ], [ 1, %if.end23 ]
+ ret i32 %retval.0
+}
+
+define i32 @fct1(i32 %i1, i32 %i2) optsize {
+; CHECK: @fct1
+; Addressing modes are folded when optimizing for code size.
+; CHECK: , sxtw]
+; CHECK: , sxtw]
+entry:
+ %idxprom = sext i32 %i1 to i64
+ %0 = load i8** @block, align 8
+ %arrayidx = getelementptr inbounds i8* %0, i64 %idxprom
+ %1 = load i8* %arrayidx, align 1
+ %idxprom1 = sext i32 %i2 to i64
+ %arrayidx2 = getelementptr inbounds i8* %0, i64 %idxprom1
+ %2 = load i8* %arrayidx2, align 1
+ %cmp = icmp eq i8 %1, %2
+ br i1 %cmp, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %cmp7 = icmp ugt i8 %1, %2
+ %conv8 = zext i1 %cmp7 to i32
+ br label %return
+
+if.end: ; preds = %entry
+ %inc = add nsw i32 %i1, 1
+ %inc9 = add nsw i32 %i2, 1
+ %idxprom10 = sext i32 %inc to i64
+ %arrayidx11 = getelementptr inbounds i8* %0, i64 %idxprom10
+ %3 = load i8* %arrayidx11, align 1
+ %idxprom12 = sext i32 %inc9 to i64
+ %arrayidx13 = getelementptr inbounds i8* %0, i64 %idxprom12
+ %4 = load i8* %arrayidx13, align 1
+ %cmp16 = icmp eq i8 %3, %4
+ br i1 %cmp16, label %if.end23, label %if.then18
+
+if.then18: ; preds = %if.end
+ %cmp21 = icmp ugt i8 %3, %4
+ %conv22 = zext i1 %cmp21 to i32
+ br label %return
+
+if.end23: ; preds = %if.end
+ %inc24 = add nsw i32 %i1, 2
+ %inc25 = add nsw i32 %i2, 2
+ %idxprom26 = sext i32 %inc24 to i64
+ %arrayidx27 = getelementptr inbounds i8* %0, i64 %idxprom26
+ %5 = load i8* %arrayidx27, align 1
+ %idxprom28 = sext i32 %inc25 to i64
+ %arrayidx29 = getelementptr inbounds i8* %0, i64 %idxprom28
+ %6 = load i8* %arrayidx29, align 1
+ %cmp32 = icmp eq i8 %5, %6
+ br i1 %cmp32, label %return, label %if.then34
+
+if.then34: ; preds = %if.end23
+ %cmp37 = icmp ugt i8 %5, %6
+ %conv38 = zext i1 %cmp37 to i32
+ br label %return
+
+return: ; preds = %if.end23, %if.then34, %if.then18, %if.then
+ %retval.0 = phi i32 [ %conv8, %if.then ], [ %conv22, %if.then18 ], [ %conv38, %if.then34 ], [ 1, %if.end23 ]
+ ret i32 %retval.0
+}
+
+; CHECK: @test
+; CHECK-NOT: , uxtw #2]
+define i32 @test(i32* %array, i8 zeroext %c, i32 %arg) {
+entry:
+ %conv = zext i8 %c to i32
+ %add = sub i32 0, %arg
+ %tobool = icmp eq i32 %conv, %add
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %idxprom = zext i8 %c to i64
+ %arrayidx = getelementptr inbounds i32* %array, i64 %idxprom
+ %0 = load volatile i32* %arrayidx, align 4
+ %1 = load volatile i32* %arrayidx, align 4
+ %add3 = add nsw i32 %1, %0
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ %res.0 = phi i32 [ %add3, %if.then ], [ 0, %entry ]
+ ret i32 %res.0
+}
+
+
+; CHECK: @test2
+; CHECK: , uxtw #2]
+; CHECK: , uxtw #2]
+define i32 @test2(i32* %array, i8 zeroext %c, i32 %arg) optsize {
+entry:
+ %conv = zext i8 %c to i32
+ %add = sub i32 0, %arg
+ %tobool = icmp eq i32 %conv, %add
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %idxprom = zext i8 %c to i64
+ %arrayidx = getelementptr inbounds i32* %array, i64 %idxprom
+ %0 = load volatile i32* %arrayidx, align 4
+ %1 = load volatile i32* %arrayidx, align 4
+ %add3 = add nsw i32 %1, %0
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ %res.0 = phi i32 [ %add3, %if.then ], [ 0, %entry ]
+ ret i32 %res.0
+}
diff --git a/test/CodeGen/AArch64/arm64-addr-type-promotion.ll b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
new file mode 100644
index 0000000..1a3ca8b
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
@@ -0,0 +1,82 @@
+; RUN: llc -march arm64 < %s | FileCheck %s
+; rdar://13452552
+; ModuleID = 'reduced_test.ll'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "arm64-apple-ios3.0.0"
+
+@block = common global i8* null, align 8
+
+define zeroext i8 @fullGtU(i32 %i1, i32 %i2) {
+; CHECK: fullGtU
+; CHECK: adrp [[PAGE:x[0-9]+]], _block@GOTPAGE
+; CHECK: ldr [[ADDR:x[0-9]+]], {{\[}}[[PAGE]], _block@GOTPAGEOFF]
+; CHECK-NEXT: ldr [[BLOCKBASE:x[0-9]+]], {{\[}}[[ADDR]]]
+; CHECK-NEXT: ldrb [[BLOCKVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE]], w0, sxtw]
+; CHECK-NEXT: ldrb [[BLOCKVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE]], w1, sxtw]
+; CHECK-NEXT: cmp [[BLOCKVAL1]], [[BLOCKVAL2]]
+; CHECK-NEXT: b.ne
+; Next BB
+; CHECK: add [[BLOCKBASE2:x[0-9]+]], [[BLOCKBASE]], w1, sxtw
+; CHECK-NEXT: add [[BLOCKBASE1:x[0-9]+]], [[BLOCKBASE]], w0, sxtw
+; CHECK-NEXT: ldrb [[LOADEDVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE1]], #1]
+; CHECK-NEXT: ldrb [[LOADEDVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE2]], #1]
+; CHECK-NEXT: cmp [[LOADEDVAL1]], [[LOADEDVAL2]]
+; CHECK-NEXT: b.ne
+; Next BB
+; CHECK: ldrb [[LOADEDVAL3:w[0-9]+]], {{\[}}[[BLOCKBASE1]], #2]
+; CHECK-NEXT: ldrb [[LOADEDVAL4:w[0-9]+]], {{\[}}[[BLOCKBASE2]], #2]
+; CHECK-NEXT: cmp [[LOADEDVAL3]], [[LOADEDVAL4]]
+entry:
+ %idxprom = sext i32 %i1 to i64
+ %tmp = load i8** @block, align 8
+ %arrayidx = getelementptr inbounds i8* %tmp, i64 %idxprom
+ %tmp1 = load i8* %arrayidx, align 1
+ %idxprom1 = sext i32 %i2 to i64
+ %arrayidx2 = getelementptr inbounds i8* %tmp, i64 %idxprom1
+ %tmp2 = load i8* %arrayidx2, align 1
+ %cmp = icmp eq i8 %tmp1, %tmp2
+ br i1 %cmp, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %cmp7 = icmp ugt i8 %tmp1, %tmp2
+ %conv9 = zext i1 %cmp7 to i8
+ br label %return
+
+if.end: ; preds = %entry
+ %inc = add nsw i32 %i1, 1
+ %inc10 = add nsw i32 %i2, 1
+ %idxprom11 = sext i32 %inc to i64
+ %arrayidx12 = getelementptr inbounds i8* %tmp, i64 %idxprom11
+ %tmp3 = load i8* %arrayidx12, align 1
+ %idxprom13 = sext i32 %inc10 to i64
+ %arrayidx14 = getelementptr inbounds i8* %tmp, i64 %idxprom13
+ %tmp4 = load i8* %arrayidx14, align 1
+ %cmp17 = icmp eq i8 %tmp3, %tmp4
+ br i1 %cmp17, label %if.end25, label %if.then19
+
+if.then19: ; preds = %if.end
+ %cmp22 = icmp ugt i8 %tmp3, %tmp4
+ %conv24 = zext i1 %cmp22 to i8
+ br label %return
+
+if.end25: ; preds = %if.end
+ %inc26 = add nsw i32 %i1, 2
+ %inc27 = add nsw i32 %i2, 2
+ %idxprom28 = sext i32 %inc26 to i64
+ %arrayidx29 = getelementptr inbounds i8* %tmp, i64 %idxprom28
+ %tmp5 = load i8* %arrayidx29, align 1
+ %idxprom30 = sext i32 %inc27 to i64
+ %arrayidx31 = getelementptr inbounds i8* %tmp, i64 %idxprom30
+ %tmp6 = load i8* %arrayidx31, align 1
+ %cmp34 = icmp eq i8 %tmp5, %tmp6
+ br i1 %cmp34, label %return, label %if.then36
+
+if.then36: ; preds = %if.end25
+ %cmp39 = icmp ugt i8 %tmp5, %tmp6
+ %conv41 = zext i1 %cmp39 to i8
+ br label %return
+
+return: ; preds = %if.then36, %if.end25, %if.then19, %if.then
+ %retval.0 = phi i8 [ %conv9, %if.then ], [ %conv24, %if.then19 ], [ %conv41, %if.then36 ], [ 0, %if.end25 ]
+ ret i8 %retval.0
+}
diff --git a/test/CodeGen/AArch64/arm64-addrmode.ll b/test/CodeGen/AArch64/arm64-addrmode.ll
new file mode 100644
index 0000000..700fba8
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-addrmode.ll
@@ -0,0 +1,72 @@
+; RUN: llc -march=arm64 < %s | FileCheck %s
+; rdar://10232252
+
+@object = external hidden global i64, section "__DATA, __objc_ivar", align 8
+
+; base + offset (imm9)
+; CHECK: @t1
+; CHECK: ldr xzr, [x{{[0-9]+}}, #8]
+; CHECK: ret
+define void @t1() {
+ %incdec.ptr = getelementptr inbounds i64* @object, i64 1
+ %tmp = load volatile i64* %incdec.ptr, align 8
+ ret void
+}
+
+; base + offset (> imm9)
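+; -33 * 8 bytes = -264, which is just outside the signed 9-bit unscaled range
+; (-256..255), so the address is materialized with a separate sub.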
+; CHECK: @t2
+; CHECK: sub [[ADDREG:x[0-9]+]], x{{[0-9]+}}, #264
+; CHECK: ldr xzr, [
+; CHECK: [[ADDREG]]]
+; CHECK: ret
+define void @t2() {
+ %incdec.ptr = getelementptr inbounds i64* @object, i64 -33
+ %tmp = load volatile i64* %incdec.ptr, align 8
+ ret void
+}
+
+; base + unsigned offset (> imm9 and <= imm12 * size of type in bytes)
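+; 4095 * 8 bytes = 32760, the largest offset that still fits the scaled
+; 12-bit unsigned immediate for an 8-byte load.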
+; CHECK: @t3
+; CHECK: ldr xzr, [x{{[0-9]+}}, #32760]
+; CHECK: ret
+define void @t3() {
+ %incdec.ptr = getelementptr inbounds i64* @object, i64 4095
+ %tmp = load volatile i64* %incdec.ptr, align 8
+ ret void
+}
+
+; base + unsigned offset (> imm12 * size of type in bytes)
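+; 4096 * 8 bytes = 32768 no longer fits the scaled 12-bit immediate, so the
+; offset is added up front via "add ..., #8, lsl #12" (8 << 12 = 32768).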
+; CHECK: @t4
+; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, #8, lsl #12
+; CHECK: ldr xzr, [
+; CHECK: [[ADDREG]]]
+; CHECK: ret
+define void @t4() {
+ %incdec.ptr = getelementptr inbounds i64* @object, i64 4096
+ %tmp = load volatile i64* %incdec.ptr, align 8
+ ret void
+}
+
+; base + reg
+; CHECK: @t5
+; CHECK: ldr xzr, [x{{[0-9]+}}, x{{[0-9]+}}, lsl #3]
+; CHECK: ret
+define void @t5(i64 %a) {
+ %incdec.ptr = getelementptr inbounds i64* @object, i64 %a
+ %tmp = load volatile i64* %incdec.ptr, align 8
+ ret void
+}
+
+; base + reg + imm
+; CHECK: @t6
+; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, x{{[0-9]+}}, lsl #3
+; CHECK-NEXT: add [[ADDREG]], [[ADDREG]], #8, lsl #12
+; CHECK: ldr xzr, [
+; CHECK: [[ADDREG]]]
+; CHECK: ret
+define void @t6(i64 %a) {
+ %tmp1 = getelementptr inbounds i64* @object, i64 %a
+ %incdec.ptr = getelementptr inbounds i64* %tmp1, i64 4096
+ %tmp = load volatile i64* %incdec.ptr, align 8
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll b/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll
new file mode 100644
index 0000000..f396bc9
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin -enable-misched=false | FileCheck %s
+
+; rdar://12713765
+; Make sure we are not creating stack objects that are assumed to be 64-byte
+; aligned.
+@T3_retval = common global <16 x float> zeroinitializer, align 16
+
+define void @test(<16 x float>* noalias sret %agg.result) nounwind ssp {
+entry:
+; CHECK: test
+; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp, #32]
+; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp]
+; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], {{\[}}[[BASE:x[0-9]+]], #32]
+; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], {{\[}}[[BASE]]]
+ %retval = alloca <16 x float>, align 16
+ %0 = load <16 x float>* @T3_retval, align 16
+ store <16 x float> %0, <16 x float>* %retval
+ %1 = load <16 x float>* %retval
+ store <16 x float> %1, <16 x float>* %agg.result, align 16
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll
new file mode 100644
index 0000000..3750f31
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=arm64 -mcpu=cyclone < %s | FileCheck %s
+
+; CHECK: foo
+; CHECK: ldr w[[REG:[0-9]+]], [x19, #264]
+; CHECK: str w[[REG]], [x19, #132]
+; CHECK: ldr w{{[0-9]+}}, [x19, #264]
+
+define i32 @foo(i32 %a) nounwind {
+ %retval = alloca i32, align 4
+ %a.addr = alloca i32, align 4
+ %arr = alloca [32 x i32], align 4
+ %i = alloca i32, align 4
+ %arr2 = alloca [32 x i32], align 4
+ %j = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ %tmp = load i32* %a.addr, align 4
+ %tmp1 = zext i32 %tmp to i64
+ %v = mul i64 4, %tmp1
+ %vla = alloca i8, i64 %v, align 4
+ %tmp2 = bitcast i8* %vla to i32*
+ %tmp3 = load i32* %a.addr, align 4
+ store i32 %tmp3, i32* %i, align 4
+ %tmp4 = load i32* %a.addr, align 4
+ store i32 %tmp4, i32* %j, align 4
+ %tmp5 = load i32* %j, align 4
+ store i32 %tmp5, i32* %retval
+ %x = load i32* %retval
+ ret i32 %x
+}
diff --git a/test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll b/test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll
new file mode 100644
index 0000000..4194977
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll
@@ -0,0 +1,72 @@
+; RUN: llc -O1 -march=arm64 -enable-andcmp-sinking=true < %s | FileCheck %s
+; ModuleID = 'and-cbz-extr-mr.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "arm64-apple-ios7.0.0"
+
+define zeroext i1 @foo(i1 %IsEditable, i1 %isTextField, i8* %str1, i8* %str2, i8* %str3, i8* %str4, i8* %str5, i8* %str6, i8* %str7, i8* %str8, i8* %str9, i8* %str10, i8* %str11, i8* %str12, i8* %str13, i32 %int1, i8* %str14) unnamed_addr #0 align 2 {
+; CHECK: _foo:
+entry:
+ %tobool = icmp eq i8* %str14, null
+ br i1 %tobool, label %return, label %if.end
+
+; CHECK: %if.end
+; CHECK: tbz
+if.end: ; preds = %entry
+ %and.i.i.i = and i32 %int1, 4
+ %tobool.i.i.i = icmp eq i32 %and.i.i.i, 0
+ br i1 %tobool.i.i.i, label %if.end12, label %land.rhs.i
+
+land.rhs.i: ; preds = %if.end
+ %cmp.i.i.i = icmp eq i8* %str12, %str13
+ br i1 %cmp.i.i.i, label %if.then3, label %lor.rhs.i.i.i
+
+lor.rhs.i.i.i: ; preds = %land.rhs.i
+ %cmp.i13.i.i.i = icmp eq i8* %str10, %str11
+ br i1 %cmp.i13.i.i.i, label %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, label %if.end5
+
+_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit: ; preds = %lor.rhs.i.i.i
+ %cmp.i.i.i.i = icmp eq i8* %str8, %str9
+ br i1 %cmp.i.i.i.i, label %if.then3, label %if.end5
+
+if.then3: ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, %land.rhs.i
+ %tmp11 = load i8* %str14, align 8
+ %tmp12 = and i8 %tmp11, 2
+ %tmp13 = icmp ne i8 %tmp12, 0
+ br label %return
+
+if.end5: ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, %lor.rhs.i.i.i
+; CHECK: %if.end5
+; CHECK: tbz
+ br i1 %tobool.i.i.i, label %if.end12, label %land.rhs.i19
+
+land.rhs.i19: ; preds = %if.end5
+ %cmp.i.i.i18 = icmp eq i8* %str6, %str7
+ br i1 %cmp.i.i.i18, label %if.then7, label %lor.rhs.i.i.i23
+
+lor.rhs.i.i.i23: ; preds = %land.rhs.i19
+ %cmp.i13.i.i.i22 = icmp eq i8* %str3, %str4
+ br i1 %cmp.i13.i.i.i22, label %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, label %if.end12
+
+_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28: ; preds = %lor.rhs.i.i.i23
+ %cmp.i.i.i.i26 = icmp eq i8* %str1, %str2
+ br i1 %cmp.i.i.i.i26, label %if.then7, label %if.end12
+
+if.then7: ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, %land.rhs.i19
+ br i1 %isTextField, label %if.then9, label %if.end12
+
+if.then9: ; preds = %if.then7
+ %tmp23 = load i8* %str5, align 8
+ %tmp24 = and i8 %tmp23, 2
+ %tmp25 = icmp ne i8 %tmp24, 0
+ br label %return
+
+if.end12: ; preds = %if.then7, %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, %lor.rhs.i.i.i23, %if.end5, %if.end
+ %lnot = xor i1 %IsEditable, true
+ br label %return
+
+return: ; preds = %if.end12, %if.then9, %if.then3, %entry
+ %retval.0 = phi i1 [ %tmp13, %if.then3 ], [ %tmp25, %if.then9 ], [ %lnot, %if.end12 ], [ true, %entry ]
+ ret i1 %retval.0
+}
+
+attributes #0 = { nounwind ssp }
diff --git a/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll b/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll
new file mode 100644
index 0000000..34d6287
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll
@@ -0,0 +1,31 @@
+; RUN: llc %s -o - | FileCheck %s
+; Check that ANDS (tst) is not merged with ADD when the immediate
+; is not 0.
+; <rdar://problem/16693089>
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios"
+
+; CHECK-LABEL: tst1:
+; CHECK: add [[REG:w[0-9]+]], w{{[0-9]+}}, #1
+; CHECK: tst [[REG]], #0x1
+define void @tst1() {
+entry:
+ br i1 undef, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %result.09 = phi i32 [ %add2.result.0, %for.body ], [ 1, %entry ]
+ %i.08 = phi i32 [ %inc, %for.body ], [ 2, %entry ]
+ %and = and i32 %i.08, 1
+ %cmp1 = icmp eq i32 %and, 0
+ %add2.result.0 = select i1 %cmp1, i32 undef, i32 %result.09
+ %inc = add nsw i32 %i.08, 1
+ %cmp = icmp slt i32 %i.08, undef
+ br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge: ; preds = %for.body
+ %add2.result.0.lcssa = phi i32 [ %add2.result.0, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.cond.for.end_crit_edge, %entry
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-anyregcc-crash.ll b/test/CodeGen/AArch64/arm64-anyregcc-crash.ll
new file mode 100644
index 0000000..241cf97
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-anyregcc-crash.ll
@@ -0,0 +1,19 @@
+; RUN: not llc < %s -mtriple=arm64-apple-darwin 2>&1 | FileCheck %s
+;
+; Check that misuse of anyregcc results in a compile time error.
+
+; CHECK: LLVM ERROR: ran out of registers during register allocation
+define i64 @anyreglimit(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7, i64 %v8,
+ i64 %v9, i64 %v10, i64 %v11, i64 %v12, i64 %v13, i64 %v14, i64 %v15, i64 %v16,
+ i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24,
+ i64 %v25, i64 %v26, i64 %v27, i64 %v28, i64 %v29, i64 %v30, i64 %v31, i64 %v32) {
+entry:
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 32,
+ i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7, i64 %v8,
+ i64 %v9, i64 %v10, i64 %v11, i64 %v12, i64 %v13, i64 %v14, i64 %v15, i64 %v16,
+ i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24,
+ i64 %v25, i64 %v26, i64 %v27, i64 %v28, i64 %v29, i64 %v30, i64 %v31, i64 %v32)
+ ret i64 %result
+}
+
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
diff --git a/test/CodeGen/AArch64/arm64-anyregcc.ll b/test/CodeGen/AArch64/arm64-anyregcc.ll
new file mode 100644
index 0000000..e26875d
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-anyregcc.ll
@@ -0,0 +1,363 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+
+; Stackmap Header: no constants - 8 callsites
+; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT: __LLVM_StackMaps:
+; Header
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .short 0
+; Num Functions
+; CHECK-NEXT: .long 8
+; Num LargeConstants
+; CHECK-NEXT: .long 0
+; Num Callsites
+; CHECK-NEXT: .long 8
+
+; Functions and stack size
+; CHECK-NEXT: .quad _test
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _property_access1
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _property_access2
+; CHECK-NEXT: .quad 32
+; CHECK-NEXT: .quad _property_access3
+; CHECK-NEXT: .quad 32
+; CHECK-NEXT: .quad _anyreg_test1
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _anyreg_test2
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _patchpoint_spilldef
+; CHECK-NEXT: .quad 112
+; CHECK-NEXT: .quad _patchpoint_spillargs
+; CHECK-NEXT: .quad 128
+
+
+; test
+; CHECK-LABEL: .long L{{.*}}-_test
+; CHECK-NEXT: .short 0
+; 3 locations
+; CHECK-NEXT: .short 3
+; Loc 0: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 2: Constant 3
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 3
+define i64 @test() nounwind ssp uwtable {
+entry:
+ call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 0, i32 16, i8* null, i32 2, i32 1, i32 2, i64 3)
+ ret i64 0
+}
+
+; property access 1 - %obj is an anyreg call argument and should therefore be in a register
+; CHECK-LABEL: .long L{{.*}}-_property_access1
+; CHECK-NEXT: .short 0
+; 2 locations
+; CHECK-NEXT: .short 2
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define i64 @property_access1(i8* %obj) nounwind ssp uwtable {
+entry:
+ %f = inttoptr i64 281474417671919 to i8*
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 1, i32 20, i8* %f, i32 1, i8* %obj)
+ ret i64 %ret
+}
+
+; property access 2 - %obj is an anyreg call argument and should therefore be in a register
+; CHECK-LABEL: .long L{{.*}}-_property_access2
+; CHECK-NEXT: .short 0
+; 2 locations
+; CHECK-NEXT: .short 2
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define i64 @property_access2() nounwind ssp uwtable {
+entry:
+ %obj = alloca i64, align 8
+ %f = inttoptr i64 281474417671919 to i8*
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %f, i32 1, i64* %obj)
+ ret i64 %ret
+}
+
+; property access 3 - %obj is a frame index
+; CHECK-LABEL: .long L{{.*}}-_property_access3
+; CHECK-NEXT: .short 0
+; 2 locations
+; CHECK-NEXT: .short 2
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Direct FP - 8
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 29
+; CHECK-NEXT: .long -8
+define i64 @property_access3() nounwind ssp uwtable {
+entry:
+ %obj = alloca i64, align 8
+ %f = inttoptr i64 281474417671919 to i8*
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 3, i32 20, i8* %f, i32 0, i64* %obj)
+ ret i64 %ret
+}
+
+; anyreg_test1
+; CHECK-LABEL: .long L{{.*}}-_anyreg_test1
+; CHECK-NEXT: .short 0
+; 14 locations
+; CHECK-NEXT: .short 14
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 2: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 3: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 4: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 5: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 6: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 7: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 8: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 9: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 10: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 11: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 12: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 13: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define i64 @anyreg_test1(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
+entry:
+ %f = inttoptr i64 281474417671919 to i8*
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 4, i32 20, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+ ret i64 %ret
+}
+
+; anyreg_test2
+; CHECK-LABEL: .long L{{.*}}-_anyreg_test2
+; CHECK-NEXT: .short 0
+; 14 locations
+; CHECK-NEXT: .short 14
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 2: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 3: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 4: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 5: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 6: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 7: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 8: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 9: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 10: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 11: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 12: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 13: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
+entry:
+ %f = inttoptr i64 281474417671919 to i8*
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+ ret i64 %ret
+}
+
+; Test spilling the return value of an anyregcc call.
+;
+; <rdar://problem/15432754> [JS] Assertion: "Folded a def to a non-store!"
+;
+; CHECK-LABEL: .long L{{.*}}-_patchpoint_spilldef
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 3
+; Loc 0: Register (some register that will be spilled to the stack)
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 2: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define i64 @patchpoint_spilldef(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
+ tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
+ ret i64 %result
+}
+
+; Test spilling the arguments of an anyregcc call.
+;
+; <rdar://problem/15487687> [JS] AnyRegCC argument ends up being spilled
+;
+; CHECK-LABEL: .long L{{.*}}-_patchpoint_spillargs
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 5
+; Loc 0: Return value in a register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Arg0 in a Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 2: Arg1 in a Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 3: Arg2 spilled to FP -96
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 29
+; CHECK-NEXT: .long -96
+; Loc 4: Arg3 spilled to FP - 88
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 29
+; CHECK-NEXT: .long -88
+define i64 @patchpoint_spillargs(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+ tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 13, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+ ret i64 %result
+}
+
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
diff --git a/test/CodeGen/AArch64/arm64-arith-saturating.ll b/test/CodeGen/AArch64/arm64-arith-saturating.ll
new file mode 100644
index 0000000..78cd1fc
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-arith-saturating.ll
@@ -0,0 +1,153 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone | FileCheck %s
+
+define i32 @qadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: qadds:
+; CHECK: sqadd s0, s0, s1
+ %vecext = extractelement <4 x i32> %b, i32 0
+ %vecext1 = extractelement <4 x i32> %c, i32 0
+ %vqadd.i = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %vecext, i32 %vecext1) nounwind
+ ret i32 %vqadd.i
+}
+
+define i64 @qaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: qaddd:
+; CHECK: sqadd d0, d0, d1
+ %vecext = extractelement <2 x i64> %b, i32 0
+ %vecext1 = extractelement <2 x i64> %c, i32 0
+ %vqadd.i = tail call i64 @llvm.aarch64.neon.sqadd.i64(i64 %vecext, i64 %vecext1) nounwind
+ ret i64 %vqadd.i
+}
+
+define i32 @uqadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: uqadds:
+; CHECK: uqadd s0, s0, s1
+ %vecext = extractelement <4 x i32> %b, i32 0
+ %vecext1 = extractelement <4 x i32> %c, i32 0
+ %vqadd.i = tail call i32 @llvm.aarch64.neon.uqadd.i32(i32 %vecext, i32 %vecext1) nounwind
+ ret i32 %vqadd.i
+}
+
+define i64 @uqaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: uqaddd:
+; CHECK: uqadd d0, d0, d1
+ %vecext = extractelement <2 x i64> %b, i32 0
+ %vecext1 = extractelement <2 x i64> %c, i32 0
+ %vqadd.i = tail call i64 @llvm.aarch64.neon.uqadd.i64(i64 %vecext, i64 %vecext1) nounwind
+ ret i64 %vqadd.i
+}
+
+declare i64 @llvm.aarch64.neon.uqadd.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.uqadd.i32(i32, i32) nounwind readnone
+declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32) nounwind readnone
+
+define i32 @qsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: qsubs:
+; CHECK: sqsub s0, s0, s1
+ %vecext = extractelement <4 x i32> %b, i32 0
+ %vecext1 = extractelement <4 x i32> %c, i32 0
+ %vqsub.i = tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 %vecext, i32 %vecext1) nounwind
+ ret i32 %vqsub.i
+}
+
+define i64 @qsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: qsubd:
+; CHECK: sqsub d0, d0, d1
+ %vecext = extractelement <2 x i64> %b, i32 0
+ %vecext1 = extractelement <2 x i64> %c, i32 0
+ %vqsub.i = tail call i64 @llvm.aarch64.neon.sqsub.i64(i64 %vecext, i64 %vecext1) nounwind
+ ret i64 %vqsub.i
+}
+
+define i32 @uqsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: uqsubs:
+; CHECK: uqsub s0, s0, s1
+ %vecext = extractelement <4 x i32> %b, i32 0
+ %vecext1 = extractelement <4 x i32> %c, i32 0
+ %vqsub.i = tail call i32 @llvm.aarch64.neon.uqsub.i32(i32 %vecext, i32 %vecext1) nounwind
+ ret i32 %vqsub.i
+}
+
+define i64 @uqsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: uqsubd:
+; CHECK: uqsub d0, d0, d1
+ %vecext = extractelement <2 x i64> %b, i32 0
+ %vecext1 = extractelement <2 x i64> %c, i32 0
+ %vqsub.i = tail call i64 @llvm.aarch64.neon.uqsub.i64(i64 %vecext, i64 %vecext1) nounwind
+ ret i64 %vqsub.i
+}
+
+declare i64 @llvm.aarch64.neon.uqsub.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.uqsub.i32(i32, i32) nounwind readnone
+declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32) nounwind readnone
+
+define i32 @qabss(<4 x i32> %b, <4 x i32> %c) nounwind readnone {
+; CHECK-LABEL: qabss:
+; CHECK: sqabs s0, s0
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %b, i32 0
+ %vqabs.i = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %vecext) nounwind
+ ret i32 %vqabs.i
+}
+
+define i64 @qabsd(<2 x i64> %b, <2 x i64> %c) nounwind readnone {
+; CHECK-LABEL: qabsd:
+; CHECK: sqabs d0, d0
+; CHECK: ret
+ %vecext = extractelement <2 x i64> %b, i32 0
+ %vqabs.i = tail call i64 @llvm.aarch64.neon.sqabs.i64(i64 %vecext) nounwind
+ ret i64 %vqabs.i
+}
+
+define i32 @qnegs(<4 x i32> %b, <4 x i32> %c) nounwind readnone {
+; CHECK-LABEL: qnegs:
+; CHECK: sqneg s0, s0
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %b, i32 0
+ %vqneg.i = tail call i32 @llvm.aarch64.neon.sqneg.i32(i32 %vecext) nounwind
+ ret i32 %vqneg.i
+}
+
+define i64 @qnegd(<2 x i64> %b, <2 x i64> %c) nounwind readnone {
+; CHECK-LABEL: qnegd:
+; CHECK: sqneg d0, d0
+; CHECK: ret
+ %vecext = extractelement <2 x i64> %b, i32 0
+ %vqneg.i = tail call i64 @llvm.aarch64.neon.sqneg.i64(i64 %vecext) nounwind
+ ret i64 %vqneg.i
+}
+
+declare i64 @llvm.aarch64.neon.sqneg.i64(i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqneg.i32(i32) nounwind readnone
+declare i64 @llvm.aarch64.neon.sqabs.i64(i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqabs.i32(i32) nounwind readnone
+
+
+define i32 @vqmovund(<2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: vqmovund:
+; CHECK: sqxtun s0, d0
+ %vecext = extractelement <2 x i64> %b, i32 0
+ %vqmovun.i = tail call i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64 %vecext) nounwind
+ ret i32 %vqmovun.i
+}
+
+define i32 @vqmovnd_s(<2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: vqmovnd_s:
+; CHECK: sqxtn s0, d0
+ %vecext = extractelement <2 x i64> %b, i32 0
+ %vqmovn.i = tail call i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64 %vecext) nounwind
+ ret i32 %vqmovn.i
+}
+
+define i32 @vqmovnd_u(<2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: vqmovnd_u:
+; CHECK: uqxtn s0, d0
+ %vecext = extractelement <2 x i64> %b, i32 0
+ %vqmovn.i = tail call i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64 %vecext) nounwind
+ ret i32 %vqmovn.i
+}
+
+declare i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-arith.ll b/test/CodeGen/AArch64/arm64-arith.ll
new file mode 100644
index 0000000..ed9b569
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-arith.ll
@@ -0,0 +1,262 @@
+; RUN: llc < %s -march=arm64 -asm-verbose=false | FileCheck %s
+
+define i32 @t1(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: add w0, w1, w0
+; CHECK: ret
+ %add = add i32 %b, %a
+ ret i32 %add
+}
+
+define i32 @t2(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: udiv w0, w0, w1
+; CHECK: ret
+ %udiv = udiv i32 %a, %b
+ ret i32 %udiv
+}
+
+define i64 @t3(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t3:
+; CHECK: udiv x0, x0, x1
+; CHECK: ret
+ %udiv = udiv i64 %a, %b
+ ret i64 %udiv
+}
+
+define i32 @t4(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t4:
+; CHECK: sdiv w0, w0, w1
+; CHECK: ret
+ %sdiv = sdiv i32 %a, %b
+ ret i32 %sdiv
+}
+
+define i64 @t5(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t5:
+; CHECK: sdiv x0, x0, x1
+; CHECK: ret
+ %sdiv = sdiv i64 %a, %b
+ ret i64 %sdiv
+}
+
+define i32 @t6(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t6:
+; CHECK: lsl w0, w0, w1
+; CHECK: ret
+ %shl = shl i32 %a, %b
+ ret i32 %shl
+}
+
+define i64 @t7(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t7:
+; CHECK: lsl x0, x0, x1
+; CHECK: ret
+ %shl = shl i64 %a, %b
+ ret i64 %shl
+}
+
+define i32 @t8(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t8:
+; CHECK: lsr w0, w0, w1
+; CHECK: ret
+ %lshr = lshr i32 %a, %b
+ ret i32 %lshr
+}
+
+define i64 @t9(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t9:
+; CHECK: lsr x0, x0, x1
+; CHECK: ret
+ %lshr = lshr i64 %a, %b
+ ret i64 %lshr
+}
+
+define i32 @t10(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t10:
+; CHECK: asr w0, w0, w1
+; CHECK: ret
+ %ashr = ashr i32 %a, %b
+ ret i32 %ashr
+}
+
+define i64 @t11(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t11:
+; CHECK: asr x0, x0, x1
+; CHECK: ret
+ %ashr = ashr i64 %a, %b
+ ret i64 %ashr
+}
+
+define i32 @t12(i16 %a, i32 %x) nounwind ssp {
+entry:
+; CHECK-LABEL: t12:
+; CHECK: add w0, w1, w0, sxth
+; CHECK: ret
+ %c = sext i16 %a to i32
+ %e = add i32 %x, %c
+ ret i32 %e
+}
+
+define i32 @t13(i16 %a, i32 %x) nounwind ssp {
+entry:
+; CHECK-LABEL: t13:
+; CHECK: add w0, w1, w0, sxth #2
+; CHECK: ret
+ %c = sext i16 %a to i32
+ %d = shl i32 %c, 2
+ %e = add i32 %x, %d
+ ret i32 %e
+}
+
+define i64 @t14(i16 %a, i64 %x) nounwind ssp {
+entry:
+; CHECK-LABEL: t14:
+; CHECK: add x0, x1, w0, uxth #3
+; CHECK: ret
+ %c = zext i16 %a to i64
+ %d = shl i64 %c, 3
+ %e = add i64 %x, %d
+ ret i64 %e
+}
+
+; rdar://9160598
+define i64 @t15(i64 %a, i64 %x) nounwind ssp {
+entry:
+; CHECK-LABEL: t15:
+; CHECK: add x0, x1, w0, uxtw
+; CHECK: ret
+ %b = and i64 %a, 4294967295
+ %c = add i64 %x, %b
+ ret i64 %c
+}
+
+define i64 @t16(i64 %x) nounwind ssp {
+entry:
+; CHECK-LABEL: t16:
+; CHECK: lsl x0, x0, #1
+; CHECK: ret
+ %a = shl i64 %x, 1
+ ret i64 %a
+}
+
+; rdar://9166974
+define i64 @t17(i16 %a, i64 %x) nounwind ssp {
+entry:
+; CHECK-LABEL: t17:
+; CHECK: sxth [[REG:x[0-9]+]], w0
+; CHECK: neg x0, [[REG]], lsl #32
+; CHECK: ret
+ %tmp16 = sext i16 %a to i64
+ %tmp17 = mul i64 %tmp16, -4294967296
+ ret i64 %tmp17
+}
+
+define i32 @t18(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t18:
+; CHECK: sdiv w0, w0, w1
+; CHECK: ret
+ %sdiv = call i32 @llvm.aarch64.sdiv.i32(i32 %a, i32 %b)
+ ret i32 %sdiv
+}
+
+define i64 @t19(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t19:
+; CHECK: sdiv x0, x0, x1
+; CHECK: ret
+ %sdiv = call i64 @llvm.aarch64.sdiv.i64(i64 %a, i64 %b)
+ ret i64 %sdiv
+}
+
+define i32 @t20(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t20:
+; CHECK: udiv w0, w0, w1
+; CHECK: ret
+ %udiv = call i32 @llvm.aarch64.udiv.i32(i32 %a, i32 %b)
+ ret i32 %udiv
+}
+
+define i64 @t21(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t21:
+; CHECK: udiv x0, x0, x1
+; CHECK: ret
+ %udiv = call i64 @llvm.aarch64.udiv.i64(i64 %a, i64 %b)
+ ret i64 %udiv
+}
+
+declare i32 @llvm.aarch64.sdiv.i32(i32, i32) nounwind readnone
+declare i64 @llvm.aarch64.sdiv.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.udiv.i32(i32, i32) nounwind readnone
+declare i64 @llvm.aarch64.udiv.i64(i64, i64) nounwind readnone
+
+; 32-bit not.
+define i32 @inv_32(i32 %x) nounwind ssp {
+entry:
+; CHECK: inv_32
+; CHECK: mvn w0, w0
+; CHECK: ret
+ %inv = xor i32 %x, -1
+ ret i32 %inv
+}
+
+; 64-bit not.
+define i64 @inv_64(i64 %x) nounwind ssp {
+entry:
+; CHECK: inv_64
+; CHECK: mvn x0, x0
+; CHECK: ret
+ %inv = xor i64 %x, -1
+ ret i64 %inv
+}
+
+; Multiplying by a power of two plus or minus one is better done via shift
+; and add/sub than via the madd/msub instructions. The latter take 4+ cycles,
+; while the former take two in total (for the two-instruction subtract sequence).
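+; For example, 9*a lowers to "add w0, w0, w0, lsl #3" (a + (a << 3)) and
+; 15*a to an lsl/sub pair, as checked below.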
+define i32 @f0(i32 %a) nounwind readnone ssp {
+; CHECK-LABEL: f0:
+; CHECK-NEXT: add w0, w0, w0, lsl #3
+; CHECK-NEXT: ret
+ %res = mul i32 %a, 9
+ ret i32 %res
+}
+
+define i64 @f1(i64 %a) nounwind readnone ssp {
+; CHECK-LABEL: f1:
+; CHECK-NEXT: lsl x8, x0, #4
+; CHECK-NEXT: sub x0, x8, x0
+; CHECK-NEXT: ret
+ %res = mul i64 %a, 15
+ ret i64 %res
+}
+
+define i32 @f2(i32 %a) nounwind readnone ssp {
+; CHECK-LABEL: f2:
+; CHECK-NEXT: lsl w8, w0, #3
+; CHECK-NEXT: sub w0, w8, w0
+; CHECK-NEXT: ret
+ %res = mul nsw i32 %a, 7
+ ret i32 %res
+}
+
+define i64 @f3(i64 %a) nounwind readnone ssp {
+; CHECK-LABEL: f3:
+; CHECK-NEXT: add x0, x0, x0, lsl #4
+; CHECK-NEXT: ret
+ %res = mul nsw i64 %a, 17
+ ret i64 %res
+}
diff --git a/test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll b/test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll
new file mode 100644
index 0000000..0904b62
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll
@@ -0,0 +1,16 @@
+; RUN: llc -march=arm64 -aarch64-dead-def-elimination=false < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios7.0.0"
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @test1() #0 {
+ %tmp1 = alloca i8
+ %tmp2 = icmp eq i8* %tmp1, null
+ %tmp3 = zext i1 %tmp2 to i32
+
+ ret i32 %tmp3
+
+ ; CHECK-LABEL: test1
+ ; CHECK: adds {{x[0-9]+}}, sp, #15
+}
diff --git a/test/CodeGen/AArch64/arm64-atomic-128.ll b/test/CodeGen/AArch64/arm64-atomic-128.ll
new file mode 100644
index 0000000..3b43aa1
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-atomic-128.ll
@@ -0,0 +1,225 @@
+; RUN: llc < %s -march=arm64 -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone | FileCheck %s
+
+@var = global i128 0
+
+define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
+; CHECK-LABEL: val_compare_and_swap:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp [[RESULTLO:x[0-9]+]], [[RESULTHI:x[0-9]+]], [x[[ADDR:[0-9]+]]]
+; CHECK-DAG: eor [[MISMATCH_LO:x[0-9]+]], [[RESULTLO]], x2
+; CHECK-DAG: eor [[MISMATCH_HI:x[0-9]+]], [[RESULTHI]], x3
+; CHECK: orr [[MISMATCH:x[0-9]+]], [[MISMATCH_LO]], [[MISMATCH_HI]]
+; CHECK: cbnz [[MISMATCH]], [[DONE:.LBB[0-9]+_[0-9]+]]
+; CHECK: stxp [[SCRATCH_RES:w[0-9]+]], x4, x5, [x[[ADDR]]]
+; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
+; CHECK: [[DONE]]:
+ %val = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire
+ ret i128 %val
+}
+
+define void @fetch_and_nand(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_nand:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK-DAG: bic [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2
+; CHECK-DAG: bic [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3
+; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK-DAG: str [[DEST_REGHI]]
+; CHECK-DAG: str [[DEST_REGLO]]
+ %val = atomicrmw nand i128* %p, i128 %bits release
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_or(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_or:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK-DAG: orr [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2
+; CHECK-DAG: orr [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3
+; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK-DAG: str [[DEST_REGHI]]
+; CHECK-DAG: str [[DEST_REGLO]]
+ %val = atomicrmw or i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_add(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_add:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: adds [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2
+; CHECK: adcs [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3
+; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK-DAG: str [[DEST_REGHI]]
+; CHECK-DAG: str [[DEST_REGLO]]
+ %val = atomicrmw add i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_sub(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_sub:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: subs [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2
+; CHECK: sbcs [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3
+; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK-DAG: str [[DEST_REGHI]]
+; CHECK-DAG: str [[DEST_REGLO]]
+ %val = atomicrmw sub i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_min(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_min:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: cmp [[DEST_REGLO]], x2
+; CHECK: cset [[LOCMP:w[0-9]+]], ls
+; CHECK: cmp [[DEST_REGHI:x[0-9]+]], x3
+; CHECK: cset [[HICMP:w[0-9]+]], le
+; CHECK: csel [[CMP:w[0-9]+]], [[LOCMP]], [[HICMP]], eq
+; CHECK: cmp [[CMP]], #0
+; CHECK-DAG: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, ne
+; CHECK-DAG: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, ne
+; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK-DAG: str [[DEST_REGHI]]
+; CHECK-DAG: str [[DEST_REGLO]]
+ %val = atomicrmw min i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_max(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_max:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: cmp [[DEST_REGLO]], x2
+; CHECK: cset [[LOCMP:w[0-9]+]], hi
+; CHECK: cmp [[DEST_REGHI:x[0-9]+]], x3
+; CHECK: cset [[HICMP:w[0-9]+]], gt
+; CHECK: csel [[CMP:w[0-9]+]], [[LOCMP]], [[HICMP]], eq
+; CHECK: cmp [[CMP]], #0
+; CHECK-DAG: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, ne
+; CHECK-DAG: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, ne
+; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK-DAG: str [[DEST_REGHI]]
+; CHECK-DAG: str [[DEST_REGLO]]
+ %val = atomicrmw max i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_umin(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_umin:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: cmp [[DEST_REGLO]], x2
+; CHECK: cset [[LOCMP:w[0-9]+]], ls
+; CHECK: cmp [[DEST_REGHI:x[0-9]+]], x3
+; CHECK: cset [[HICMP:w[0-9]+]], ls
+; CHECK: csel [[CMP:w[0-9]+]], [[LOCMP]], [[HICMP]], eq
+; CHECK: cmp [[CMP]], #0
+; CHECK-DAG: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, ne
+; CHECK-DAG: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, ne
+; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK-DAG: str [[DEST_REGHI]]
+; CHECK-DAG: str [[DEST_REGLO]]
+ %val = atomicrmw umin i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define void @fetch_and_umax(i128* %p, i128 %bits) {
+; CHECK-LABEL: fetch_and_umax:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0]
+; CHECK: cmp [[DEST_REGLO]], x2
+; CHECK: cset [[LOCMP:w[0-9]+]], hi
+; CHECK: cmp [[DEST_REGHI:x[0-9]+]], x3
+; CHECK: cset [[HICMP:w[0-9]+]], hi
+; CHECK: csel [[CMP:w[0-9]+]], [[LOCMP]], [[HICMP]], eq
+; CHECK: cmp [[CMP]], #0
+; CHECK-DAG: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, ne
+; CHECK-DAG: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, ne
+; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0]
+; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]]
+
+; CHECK-DAG: str [[DEST_REGHI]]
+; CHECK-DAG: str [[DEST_REGLO]]
+ %val = atomicrmw umax i128* %p, i128 %bits seq_cst
+ store i128 %val, i128* @var, align 16
+ ret void
+}
+
+define i128 @atomic_load_seq_cst(i128* %p) {
+; CHECK-LABEL: atomic_load_seq_cst:
+; CHECK-NOT: dmb
+; CHECK: ldaxp
+; CHECK-NOT: dmb
+ %r = load atomic i128* %p seq_cst, align 16
+ ret i128 %r
+}
+
+define i128 @atomic_load_relaxed(i128* %p) {
+; CHECK-LABEL: atomic_load_relaxed:
+; CHECK-NOT: dmb
+; CHECK: ldxp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK-NOT: dmb
+ %r = load atomic i128* %p monotonic, align 16
+ ret i128 %r
+}
+
+
+define void @atomic_store_seq_cst(i128 %in, i128* %p) {
+; CHECK-LABEL: atomic_store_seq_cst:
+; CHECK-NOT: dmb
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxp xzr, xzr, [x2]
+; CHECK: stlxp [[SUCCESS:w[0-9]+]], x0, x1, [x2]
+; CHECK: cbnz [[SUCCESS]], [[LABEL]]
+; CHECK-NOT: dmb
+ store atomic i128 %in, i128* %p seq_cst, align 16
+ ret void
+}
+
+define void @atomic_store_release(i128 %in, i128* %p) {
+; CHECK-LABEL: atomic_store_release:
+; CHECK-NOT: dmb
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxp xzr, xzr, [x2]
+; CHECK: stlxp [[SUCCESS:w[0-9]+]], x0, x1, [x2]
+; CHECK: cbnz [[SUCCESS]], [[LABEL]]
+; CHECK-NOT: dmb
+ store atomic i128 %in, i128* %p release, align 16
+ ret void
+}
+
+define void @atomic_store_relaxed(i128 %in, i128* %p) {
+; CHECK-LABEL: atomic_store_relaxed:
+; CHECK-NOT: dmb
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxp xzr, xzr, [x2]
+; CHECK: stxp [[SUCCESS:w[0-9]+]], x0, x1, [x2]
+; CHECK: cbnz [[SUCCESS]], [[LABEL]]
+; CHECK-NOT: dmb
+ store atomic i128 %in, i128* %p unordered, align 16
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-atomic.ll b/test/CodeGen/AArch64/arm64-atomic.ll
new file mode 100644
index 0000000..aa9b284
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-atomic.ll
@@ -0,0 +1,331 @@
+; RUN: llc < %s -march=arm64 -verify-machineinstrs -mcpu=cyclone | FileCheck %s
+
+define i32 @val_compare_and_swap(i32* %p) {
+; CHECK-LABEL: val_compare_and_swap:
+; CHECK: orr [[NEWVAL_REG:w[0-9]+]], wzr, #0x4
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxr [[RESULT:w[0-9]+]], [x0]
+; CHECK: cmp [[RESULT]], #7
+; CHECK: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]]
+; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[NEWVAL_REG]], [x0]
+; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
+; CHECK: [[LABEL2]]:
+ %val = cmpxchg i32* %p, i32 7, i32 4 acquire acquire
+ ret i32 %val
+}
+
+define i64 @val_compare_and_swap_64(i64* %p) {
+; CHECK-LABEL: val_compare_and_swap_64:
+; CHECK: orr w[[NEWVAL_REG:[0-9]+]], wzr, #0x4
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxr [[RESULT:x[0-9]+]], [x0]
+; CHECK: cmp [[RESULT]], #7
+; CHECK: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]]
+; CHECK-NOT: stxr x[[NEWVAL_REG]], x[[NEWVAL_REG]]
+; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], x[[NEWVAL_REG]], [x0]
+; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
+; CHECK: [[LABEL2]]:
+ %val = cmpxchg i64* %p, i64 7, i64 4 monotonic monotonic
+ ret i64 %val
+}
+
+define i32 @fetch_and_nand(i32* %p) {
+; CHECK-LABEL: fetch_and_nand:
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxr w[[DEST_REG:[0-9]+]], [x0]
+; CHECK: and [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], #0xfffffff8
+; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]]
+; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0]
+; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
+; CHECK: mov x0, x[[DEST_REG]]
+ %val = atomicrmw nand i32* %p, i32 7 release
+ ret i32 %val
+}
+
+define i64 @fetch_and_nand_64(i64* %p) {
+; CHECK-LABEL: fetch_and_nand_64:
+; CHECK: mov x[[ADDR:[0-9]+]], x0
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxr [[DEST_REG:x[0-9]+]], [x[[ADDR]]]
+; CHECK: and [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], #0xfffffffffffffff8
+; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]]
+; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
+
+ %val = atomicrmw nand i64* %p, i64 7 acq_rel
+ ret i64 %val
+}
+
+define i32 @fetch_and_or(i32* %p) {
+; CHECK-LABEL: fetch_and_or:
+; CHECK: movz [[OLDVAL_REG:w[0-9]+]], #0x5
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldaxr w[[DEST_REG:[0-9]+]], [x0]
+; CHECK: orr [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], [[OLDVAL_REG]]
+; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]]
+; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0]
+; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
+; CHECK: mov x0, x[[DEST_REG]]
+ %val = atomicrmw or i32* %p, i32 5 seq_cst
+ ret i32 %val
+}
+
+define i64 @fetch_and_or_64(i64* %p) {
+; CHECK-LABEL: fetch_and_or_64:
+; CHECK: mov x[[ADDR:[0-9]+]], x0
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxr [[DEST_REG:x[0-9]+]], [x[[ADDR]]]
+; CHECK: orr [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], #0x7
+; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]]
+; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]]
+ %val = atomicrmw or i64* %p, i64 7 monotonic
+ ret i64 %val
+}
+
+define void @acquire_fence() {
+ fence acquire
+ ret void
+ ; CHECK-LABEL: acquire_fence:
+ ; CHECK: dmb ishld
+}
+
+define void @release_fence() {
+ fence release
+ ret void
+ ; CHECK-LABEL: release_fence:
+ ; CHECK: dmb ish{{$}}
+}
+
+define void @seq_cst_fence() {
+ fence seq_cst
+ ret void
+ ; CHECK-LABEL: seq_cst_fence:
+ ; CHECK: dmb ish{{$}}
+}
+
+define i32 @atomic_load(i32* %p) {
+ %r = load atomic i32* %p seq_cst, align 4
+ ret i32 %r
+ ; CHECK-LABEL: atomic_load:
+ ; CHECK: ldar
+}
+
+define i8 @atomic_load_relaxed_8(i8* %p, i32 %off32) {
+; CHECK-LABEL: atomic_load_relaxed_8:
+ %ptr_unsigned = getelementptr i8* %p, i32 4095
+ %val_unsigned = load atomic i8* %ptr_unsigned monotonic, align 1
+; CHECK: ldrb {{w[0-9]+}}, [x0, #4095]
+
+ %ptr_regoff = getelementptr i8* %p, i32 %off32
+ %val_regoff = load atomic i8* %ptr_regoff unordered, align 1
+ %tot1 = add i8 %val_unsigned, %val_regoff
+; CHECK: ldrb {{w[0-9]+}}, [x0, w1, sxtw]
+
+ %ptr_unscaled = getelementptr i8* %p, i32 -256
+ %val_unscaled = load atomic i8* %ptr_unscaled monotonic, align 1
+ %tot2 = add i8 %tot1, %val_unscaled
+; CHECK: ldurb {{w[0-9]+}}, [x0, #-256]
+
+ %ptr_random = getelementptr i8* %p, i32 1191936 ; 0x123000 (i.e. ADD imm)
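+; Note: 1191936 = 0x123000 = 291 << 12, so this offset is expected to be materialized
+; with the shifted ADD immediate checked below; the 16/32/64-bit variants later in this
+; file follow the same pattern with the offset scaled by the element size.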
+ %val_random = load atomic i8* %ptr_random unordered, align 1
+ %tot3 = add i8 %tot2, %val_random
+; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12
+; CHECK: ldrb {{w[0-9]+}}, [x[[ADDR]]]
+
+ ret i8 %tot3
+}
+
+define i16 @atomic_load_relaxed_16(i16* %p, i32 %off32) {
+; CHECK-LABEL: atomic_load_relaxed_16:
+ %ptr_unsigned = getelementptr i16* %p, i32 4095
+ %val_unsigned = load atomic i16* %ptr_unsigned monotonic, align 2
+; CHECK: ldrh {{w[0-9]+}}, [x0, #8190]
+
+ %ptr_regoff = getelementptr i16* %p, i32 %off32
+ %val_regoff = load atomic i16* %ptr_regoff unordered, align 2
+ %tot1 = add i16 %val_unsigned, %val_regoff
+; CHECK: ldrh {{w[0-9]+}}, [x0, w1, sxtw #1]
+
+ %ptr_unscaled = getelementptr i16* %p, i32 -128
+ %val_unscaled = load atomic i16* %ptr_unscaled monotonic, align 2
+ %tot2 = add i16 %tot1, %val_unscaled
+; CHECK: ldurh {{w[0-9]+}}, [x0, #-256]
+
+ %ptr_random = getelementptr i16* %p, i32 595968 ; 0x123000/2 (i.e. ADD imm)
+ %val_random = load atomic i16* %ptr_random unordered, align 2
+ %tot3 = add i16 %tot2, %val_random
+; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12
+; CHECK: ldrh {{w[0-9]+}}, [x[[ADDR]]]
+
+ ret i16 %tot3
+}
+
+define i32 @atomic_load_relaxed_32(i32* %p, i32 %off32) {
+; CHECK-LABEL: atomic_load_relaxed_32:
+ %ptr_unsigned = getelementptr i32* %p, i32 4095
+ %val_unsigned = load atomic i32* %ptr_unsigned monotonic, align 4
+; CHECK: ldr {{w[0-9]+}}, [x0, #16380]
+
+ %ptr_regoff = getelementptr i32* %p, i32 %off32
+ %val_regoff = load atomic i32* %ptr_regoff unordered, align 4
+ %tot1 = add i32 %val_unsigned, %val_regoff
+; CHECK: ldr {{w[0-9]+}}, [x0, w1, sxtw #2]
+
+ %ptr_unscaled = getelementptr i32* %p, i32 -64
+ %val_unscaled = load atomic i32* %ptr_unscaled monotonic, align 4
+ %tot2 = add i32 %tot1, %val_unscaled
+; CHECK: ldur {{w[0-9]+}}, [x0, #-256]
+
+ %ptr_random = getelementptr i32* %p, i32 297984 ; 0x123000/4 (i.e. ADD imm)
+ %val_random = load atomic i32* %ptr_random unordered, align 4
+ %tot3 = add i32 %tot2, %val_random
+; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12
+; CHECK: ldr {{w[0-9]+}}, [x[[ADDR]]]
+
+ ret i32 %tot3
+}
+
+define i64 @atomic_load_relaxed_64(i64* %p, i32 %off32) {
+; CHECK-LABEL: atomic_load_relaxed_64:
+ %ptr_unsigned = getelementptr i64* %p, i32 4095
+ %val_unsigned = load atomic i64* %ptr_unsigned monotonic, align 8
+; CHECK: ldr {{x[0-9]+}}, [x0, #32760]
+
+ %ptr_regoff = getelementptr i64* %p, i32 %off32
+ %val_regoff = load atomic i64* %ptr_regoff unordered, align 8
+ %tot1 = add i64 %val_unsigned, %val_regoff
+; CHECK: ldr {{x[0-9]+}}, [x0, w1, sxtw #3]
+
+ %ptr_unscaled = getelementptr i64* %p, i32 -32
+ %val_unscaled = load atomic i64* %ptr_unscaled monotonic, align 8
+ %tot2 = add i64 %tot1, %val_unscaled
+; CHECK: ldur {{x[0-9]+}}, [x0, #-256]
+
+ %ptr_random = getelementptr i64* %p, i32 148992 ; 0x123000/8 (i.e. ADD imm)
+ %val_random = load atomic i64* %ptr_random unordered, align 8
+ %tot3 = add i64 %tot2, %val_random
+; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12
+; CHECK: ldr {{x[0-9]+}}, [x[[ADDR]]]
+
+ ret i64 %tot3
+}
+
+define void @atomic_store(i32* %p) {
+ store atomic i32 4, i32* %p seq_cst, align 4
+ ret void
+ ; CHECK-LABEL: atomic_store:
+ ; CHECK: stlr
+}
+
+define void @atomic_store_relaxed_8(i8* %p, i32 %off32, i8 %val) {
+; CHECK-LABEL: atomic_store_relaxed_8:
+ %ptr_unsigned = getelementptr i8* %p, i32 4095
+ store atomic i8 %val, i8* %ptr_unsigned monotonic, align 1
+; CHECK: strb {{w[0-9]+}}, [x0, #4095]
+
+ %ptr_regoff = getelementptr i8* %p, i32 %off32
+ store atomic i8 %val, i8* %ptr_regoff unordered, align 1
+; CHECK: strb {{w[0-9]+}}, [x0, w1, sxtw]
+
+ %ptr_unscaled = getelementptr i8* %p, i32 -256
+ store atomic i8 %val, i8* %ptr_unscaled monotonic, align 1
+; CHECK: sturb {{w[0-9]+}}, [x0, #-256]
+
+ %ptr_random = getelementptr i8* %p, i32 1191936 ; 0x123000 (i.e. ADD imm)
+ store atomic i8 %val, i8* %ptr_random unordered, align 1
+; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12
+; CHECK: strb {{w[0-9]+}}, [x[[ADDR]]]
+
+ ret void
+}
+
+define void @atomic_store_relaxed_16(i16* %p, i32 %off32, i16 %val) {
+; CHECK-LABEL: atomic_store_relaxed_16:
+ %ptr_unsigned = getelementptr i16* %p, i32 4095
+ store atomic i16 %val, i16* %ptr_unsigned monotonic, align 2
+; CHECK: strh {{w[0-9]+}}, [x0, #8190]
+
+ %ptr_regoff = getelementptr i16* %p, i32 %off32
+ store atomic i16 %val, i16* %ptr_regoff unordered, align 2
+; CHECK: strh {{w[0-9]+}}, [x0, w1, sxtw #1]
+
+ %ptr_unscaled = getelementptr i16* %p, i32 -128
+ store atomic i16 %val, i16* %ptr_unscaled monotonic, align 2
+; CHECK: sturh {{w[0-9]+}}, [x0, #-256]
+
+ %ptr_random = getelementptr i16* %p, i32 595968 ; 0x123000/2 (i.e. ADD imm)
+ store atomic i16 %val, i16* %ptr_random unordered, align 2
+; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12
+; CHECK: strh {{w[0-9]+}}, [x[[ADDR]]]
+
+ ret void
+}
+
+define void @atomic_store_relaxed_32(i32* %p, i32 %off32, i32 %val) {
+; CHECK-LABEL: atomic_store_relaxed_32:
+ %ptr_unsigned = getelementptr i32* %p, i32 4095
+ store atomic i32 %val, i32* %ptr_unsigned monotonic, align 4
+; CHECK: str {{w[0-9]+}}, [x0, #16380]
+
+ %ptr_regoff = getelementptr i32* %p, i32 %off32
+ store atomic i32 %val, i32* %ptr_regoff unordered, align 4
+; CHECK: str {{w[0-9]+}}, [x0, w1, sxtw #2]
+
+ %ptr_unscaled = getelementptr i32* %p, i32 -64
+ store atomic i32 %val, i32* %ptr_unscaled monotonic, align 4
+; CHECK: stur {{w[0-9]+}}, [x0, #-256]
+
+ %ptr_random = getelementptr i32* %p, i32 297984 ; 0x123000/4 (i.e. ADD imm)
+ store atomic i32 %val, i32* %ptr_random unordered, align 4
+; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12
+; CHECK: str {{w[0-9]+}}, [x[[ADDR]]]
+
+ ret void
+}
+
+define void @atomic_store_relaxed_64(i64* %p, i32 %off32, i64 %val) {
+; CHECK-LABEL: atomic_store_relaxed_64:
+ %ptr_unsigned = getelementptr i64* %p, i32 4095
+ store atomic i64 %val, i64* %ptr_unsigned monotonic, align 8
+; CHECK: str {{x[0-9]+}}, [x0, #32760]
+
+ %ptr_regoff = getelementptr i64* %p, i32 %off32
+ store atomic i64 %val, i64* %ptr_regoff unordered, align 8
+; CHECK: str {{x[0-9]+}}, [x0, w1, sxtw #3]
+
+ %ptr_unscaled = getelementptr i64* %p, i32 -32
+ store atomic i64 %val, i64* %ptr_unscaled monotonic, align 8
+; CHECK: stur {{x[0-9]+}}, [x0, #-256]
+
+ %ptr_random = getelementptr i64* %p, i32 148992 ; 0x123000/8 (i.e. ADD imm)
+ store atomic i64 %val, i64* %ptr_random unordered, align 8
+; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12
+; CHECK: str {{x[0-9]+}}, [x[[ADDR]]]
+
+ ret void
+}
+
+; rdar://11531169
+; rdar://11531308
+
+%"class.X::Atomic" = type { %struct.x_atomic_t }
+%struct.x_atomic_t = type { i32 }
+
+@counter = external hidden global %"class.X::Atomic", align 4
+
+define i32 @next_id() nounwind optsize ssp align 2 {
+entry:
+ %0 = atomicrmw add i32* getelementptr inbounds (%"class.X::Atomic"* @counter, i64 0, i32 0, i32 0), i32 1 seq_cst
+ %add.i = add i32 %0, 1
+ %tobool = icmp eq i32 %add.i, 0
+ br i1 %tobool, label %if.else, label %return
+
+if.else: ; preds = %entry
+ %1 = atomicrmw add i32* getelementptr inbounds (%"class.X::Atomic"* @counter, i64 0, i32 0, i32 0), i32 1 seq_cst
+ %add.i2 = add i32 %1, 1
+ br label %return
+
+return: ; preds = %if.else, %entry
+ %retval.0 = phi i32 [ %add.i2, %if.else ], [ %add.i, %entry ]
+ ret i32 %retval.0
+}
diff --git a/test/CodeGen/AArch64/arm64-basic-pic.ll b/test/CodeGen/AArch64/arm64-basic-pic.ll
new file mode 100644
index 0000000..9fdb1e9
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-basic-pic.ll
@@ -0,0 +1,54 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s
+
+@var = global i32 0
+
+define i32 @get_globalvar() {
+; CHECK-LABEL: get_globalvar:
+
+ %val = load i32* @var
+; CHECK: adrp x[[GOTHI:[0-9]+]], :got:var
+; CHECK: ldr x[[GOTLOC:[0-9]+]], [x[[GOTHI]], :got_lo12:var]
+; CHECK: ldr w0, [x[[GOTLOC]]]
+
+ ret i32 %val
+}
+
+define i32* @get_globalvaraddr() {
+; CHECK-LABEL: get_globalvaraddr:
+
+ %val = load i32* @var
+; CHECK: adrp x[[GOTHI:[0-9]+]], :got:var
+; CHECK: ldr x0, [x[[GOTHI]], :got_lo12:var]
+
+ ret i32* @var
+}
+
+@hiddenvar = hidden global i32 0
+
+define i32 @get_hiddenvar() {
+; CHECK-LABEL: get_hiddenvar:
+
+ %val = load i32* @hiddenvar
+; CHECK: adrp x[[HI:[0-9]+]], hiddenvar
+; CHECK: ldr w0, [x[[HI]], :lo12:hiddenvar]
+
+ ret i32 %val
+}
+
+define i32* @get_hiddenvaraddr() {
+; CHECK-LABEL: get_hiddenvaraddr:
+
+ %val = load i32* @hiddenvar
+; CHECK: adrp [[HI:x[0-9]+]], hiddenvar
+; CHECK: add x0, [[HI]], :lo12:hiddenvar
+
+ ret i32* @hiddenvar
+}
+
+define void()* @get_func() {
+; CHECK-LABEL: get_func:
+
+ ret void()* bitcast(void()*()* @get_func to void()*)
+; CHECK: adrp x[[GOTHI:[0-9]+]], :got:get_func
+; CHECK: ldr x0, [x[[GOTHI]], :got_lo12:get_func]
+}
diff --git a/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll b/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
new file mode 100644
index 0000000..f0e968b
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
@@ -0,0 +1,1101 @@
+; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -O1 -o - | FileCheck %s
+; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -O0 -fast-isel=true -o - | FileCheck %s
+
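+; On a big-endian target, a bitcast between types whose in-register lane layouts differ is
+; not a simple no-op, so these tests expect the lane-reordering REV/EXT instructions seen
+; in the CHECK lines, and ld1/st1 rather than ldr/str where the memory lane order matters.
+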
+; CHECK-LABEL: test_i64_f64:
+define void @test_i64_f64(double* %p, i64* %q) {
+; CHECK: ldr
+; CHECK: str
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = bitcast double %2 to i64
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_i64_v1i64:
+define void @test_i64_v1i64(<1 x i64>* %p, i64* %q) {
+; CHECK: ldr
+; CHECK: str
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = bitcast <1 x i64> %2 to i64
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_i64_v2f32:
+define void @test_i64_v2f32(<2 x float>* %p, i64* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: str
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = bitcast <2 x float> %2 to i64
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_i64_v2i32:
+define void @test_i64_v2i32(<2 x i32>* %p, i64* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: str
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = bitcast <2 x i32> %2 to i64
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_i64_v4i16:
+define void @test_i64_v4i16(<4 x i16>* %p, i64* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4h }
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: str
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = bitcast <4 x i16> %2 to i64
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_i64_v8i8:
+define void @test_i64_v8i8(<8 x i8>* %p, i64* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8b }
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: str
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = bitcast <8 x i8> %2 to i64
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_i64:
+define void @test_f64_i64(i64* %p, double* %q) {
+; CHECK: ldr
+; CHECK: str
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = bitcast i64 %2 to double
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_v1i64:
+define void @test_f64_v1i64(<1 x i64>* %p, double* %q) {
+; CHECK: ldr
+; CHECK: str
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = bitcast <1 x i64> %2 to double
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_v2f32:
+define void @test_f64_v2f32(<2 x float>* %p, double* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: str
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = bitcast <2 x float> %2 to double
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_v2i32:
+define void @test_f64_v2i32(<2 x i32>* %p, double* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: str
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = bitcast <2 x i32> %2 to double
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_v4i16:
+define void @test_f64_v4i16(<4 x i16>* %p, double* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4h }
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: str
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = bitcast <4 x i16> %2 to double
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_v8i8:
+define void @test_f64_v8i8(<8 x i8>* %p, double* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8b }
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: str
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = bitcast <8 x i8> %2 to double
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_i64:
+define void @test_v1i64_i64(i64* %p, <1 x i64>* %q) {
+; CHECK: ldr
+; CHECK: str
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = bitcast i64 %2 to <1 x i64>
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_f64:
+define void @test_v1i64_f64(double* %p, <1 x i64>* %q) {
+; CHECK: ldr
+; CHECK: str
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = bitcast double %2 to <1 x i64>
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_v2f32:
+define void @test_v1i64_v2f32(<2 x float>* %p, <1 x i64>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: str
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = bitcast <2 x float> %2 to <1 x i64>
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_v2i32:
+define void @test_v1i64_v2i32(<2 x i32>* %p, <1 x i64>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: str
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = bitcast <2 x i32> %2 to <1 x i64>
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_v4i16:
+define void @test_v1i64_v4i16(<4 x i16>* %p, <1 x i64>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4h }
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: str
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = bitcast <4 x i16> %2 to <1 x i64>
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_v8i8:
+define void @test_v1i64_v8i8(<8 x i8>* %p, <1 x i64>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8b }
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: str
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = bitcast <8 x i8> %2 to <1 x i64>
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_i64:
+define void @test_v2f32_i64(i64* %p, <2 x float>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = bitcast i64 %2 to <2 x float>
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_f64:
+define void @test_v2f32_f64(double* %p, <2 x float>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = bitcast double %2 to <2 x float>
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_v1i64:
+define void @test_v2f32_v1i64(<1 x i64>* %p, <2 x float>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = bitcast <1 x i64> %2 to <2 x float>
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_v2i32:
+define void @test_v2f32_v2i32(<2 x i32>* %p, <2 x float>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = bitcast <2 x i32> %2 to <2 x float>
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_v4i16:
+define void @test_v2f32_v4i16(<4 x i16>* %p, <2 x float>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4h }
+; CHECK: rev32 v{{[0-9]+}}.4h
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = bitcast <4 x i16> %2 to <2 x float>
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_v8i8:
+define void @test_v2f32_v8i8(<8 x i8>* %p, <2 x float>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8b }
+; CHECK: rev32 v{{[0-9]+}}.8b
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = bitcast <8 x i8> %2 to <2 x float>
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_i64:
+define void @test_v2i32_i64(i64* %p, <2 x i32>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = bitcast i64 %2 to <2 x i32>
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_f64:
+define void @test_v2i32_f64(double* %p, <2 x i32>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = bitcast double %2 to <2 x i32>
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_v1i64:
+define void @test_v2i32_v1i64(<1 x i64>* %p, <2 x i32>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = bitcast <1 x i64> %2 to <2 x i32>
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_v2f32:
+define void @test_v2i32_v2f32(<2 x float>* %p, <2 x i32>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = bitcast <2 x float> %2 to <2 x i32>
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_v4i16:
+define void @test_v2i32_v4i16(<4 x i16>* %p, <2 x i32>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4h }
+; CHECK: rev32 v{{[0-9]+}}.4h
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = bitcast <4 x i16> %2 to <2 x i32>
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_v8i8:
+define void @test_v2i32_v8i8(<8 x i8>* %p, <2 x i32>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8b }
+; CHECK: rev32 v{{[0-9]+}}.8b
+; CHECK: st1 { v{{[0-9]+}}.2s }
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = bitcast <8 x i8> %2 to <2 x i32>
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_i64:
+define void @test_v4i16_i64(i64* %p, <4 x i16>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: st1 { v{{[0-9]+}}.4h }
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = bitcast i64 %2 to <4 x i16>
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_f64:
+define void @test_v4i16_f64(double* %p, <4 x i16>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: st1 { v{{[0-9]+}}.4h }
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = bitcast double %2 to <4 x i16>
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_v1i64:
+define void @test_v4i16_v1i64(<1 x i64>* %p, <4 x i16>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: st1 { v{{[0-9]+}}.4h }
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = bitcast <1 x i64> %2 to <4 x i16>
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_v2f32:
+define void @test_v4i16_v2f32(<2 x float>* %p, <4 x i16>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: rev32 v{{[0-9]+}}.4h
+; CHECK: st1 { v{{[0-9]+}}.4h }
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = bitcast <2 x float> %2 to <4 x i16>
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_v2i32:
+define void @test_v4i16_v2i32(<2 x i32>* %p, <4 x i16>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: rev32 v{{[0-9]+}}.4h
+; CHECK: st1 { v{{[0-9]+}}.4h }
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = bitcast <2 x i32> %2 to <4 x i16>
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_v8i8:
+define void @test_v4i16_v8i8(<8 x i8>* %p, <4 x i16>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8b }
+; CHECK: rev16 v{{[0-9]+}}.8b
+; CHECK: st1 { v{{[0-9]+}}.4h }
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = bitcast <8 x i8> %2 to <4 x i16>
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_i64:
+define void @test_v8i8_i64(i64* %p, <8 x i8>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: st1 { v{{[0-9]+}}.8b }
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = bitcast i64 %2 to <8 x i8>
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_f64:
+define void @test_v8i8_f64(double* %p, <8 x i8>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: st1 { v{{[0-9]+}}.8b }
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = bitcast double %2 to <8 x i8>
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_v1i64:
+define void @test_v8i8_v1i64(<1 x i64>* %p, <8 x i8>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: st1 { v{{[0-9]+}}.8b }
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = bitcast <1 x i64> %2 to <8 x i8>
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_v2f32:
+define void @test_v8i8_v2f32(<2 x float>* %p, <8 x i8>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: rev32 v{{[0-9]+}}.8b
+; CHECK: st1 { v{{[0-9]+}}.8b }
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = bitcast <2 x float> %2 to <8 x i8>
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_v2i32:
+define void @test_v8i8_v2i32(<2 x i32>* %p, <8 x i8>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2s }
+; CHECK: rev32 v{{[0-9]+}}.8b
+; CHECK: st1 { v{{[0-9]+}}.8b }
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = bitcast <2 x i32> %2 to <8 x i8>
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_v4i16:
+define void @test_v8i8_v4i16(<4 x i16>* %p, <8 x i8>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4h }
+; CHECK: rev16 v{{[0-9]+}}.8b
+; CHECK: st1 { v{{[0-9]+}}.8b }
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = bitcast <4 x i16> %2 to <8 x i8>
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v2f64:
+define void @test_f128_v2f64(<2 x double>* %p, fp128* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: ext
+; CHECK: str
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = bitcast <2 x double> %2 to fp128
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v2i64:
+define void @test_f128_v2i64(<2 x i64>* %p, fp128* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: ext
+; CHECK: str
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = bitcast <2 x i64> %2 to fp128
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v4f32:
+define void @test_f128_v4f32(<4 x float>* %p, fp128* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: str q
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = bitcast <4 x float> %2 to fp128
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v4i32:
+define void @test_f128_v4i32(<4 x i32>* %p, fp128* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4s }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: str
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = bitcast <4 x i32> %2 to fp128
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v8i16:
+define void @test_f128_v8i16(<8 x i16>* %p, fp128* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8h }
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: str
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = bitcast <8 x i16> %2 to fp128
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v16i8:
+define void @test_f128_v16i8(<16 x i8>* %p, fp128* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.16b }
+; CHECK: ext
+; CHECK: str q
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = bitcast <16 x i8> %2 to fp128
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_f128:
+define void @test_v2f64_f128(fp128* %p, <2 x double>* %q) {
+; CHECK: ldr
+; CHECK: ext
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = bitcast fp128 %2 to <2 x double>
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_v2i64:
+define void @test_v2f64_v2i64(<2 x i64>* %p, <2 x double>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = bitcast <2 x i64> %2 to <2 x double>
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_v4f32:
+define void @test_v2f64_v4f32(<4 x float>* %p, <2 x double>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = bitcast <4 x float> %2 to <2 x double>
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_v4i32:
+define void @test_v2f64_v4i32(<4 x i32>* %p, <2 x double>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4s }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = bitcast <4 x i32> %2 to <2 x double>
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_v8i16:
+define void @test_v2f64_v8i16(<8 x i16>* %p, <2 x double>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8h }
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = bitcast <8 x i16> %2 to <2 x double>
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_v16i8:
+define void @test_v2f64_v16i8(<16 x i8>* %p, <2 x double>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.16b }
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = bitcast <16 x i8> %2 to <2 x double>
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_f128:
+define void @test_v2i64_f128(fp128* %p, <2 x i64>* %q) {
+; CHECK: ldr
+; CHECK: ext
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = bitcast fp128 %2 to <2 x i64>
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_v2f64:
+define void @test_v2i64_v2f64(<2 x double>* %p, <2 x i64>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = bitcast <2 x double> %2 to <2 x i64>
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_v4f32:
+define void @test_v2i64_v4f32(<4 x float>* %p, <2 x i64>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = bitcast <4 x float> %2 to <2 x i64>
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_v4i32:
+define void @test_v2i64_v4i32(<4 x i32>* %p, <2 x i64>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4s }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = bitcast <4 x i32> %2 to <2 x i64>
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_v8i16:
+define void @test_v2i64_v8i16(<8 x i16>* %p, <2 x i64>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8h }
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = bitcast <8 x i16> %2 to <2 x i64>
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_v16i8:
+define void @test_v2i64_v16i8(<16 x i8>* %p, <2 x i64>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.16b }
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = bitcast <16 x i8> %2 to <2 x i64>
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_f128:
+define void @test_v4f32_f128(fp128* %p, <4 x float>* %q) {
+; CHECK: ldr q
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = bitcast fp128 %2 to <4 x float>
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_v2f64:
+define void @test_v4f32_v2f64(<2 x double>* %p, <4 x float>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = bitcast <2 x double> %2 to <4 x float>
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_v2i64:
+define void @test_v4f32_v2i64(<2 x i64>* %p, <4 x float>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = bitcast <2 x i64> %2 to <4 x float>
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_v4i32:
+define void @test_v4f32_v4i32(<4 x i32>* %p, <4 x float>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4s }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = bitcast <4 x i32> %2 to <4 x float>
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_v8i16:
+define void @test_v4f32_v8i16(<8 x i16>* %p, <4 x float>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8h }
+; CHECK: rev32 v{{[0-9]+}}.8h
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = bitcast <8 x i16> %2 to <4 x float>
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_v16i8:
+define void @test_v4f32_v16i8(<16 x i8>* %p, <4 x float>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.16b }
+; CHECK: rev32 v{{[0-9]+}}.16b
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.2d }
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = bitcast <16 x i8> %2 to <4 x float>
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_f128:
+define void @test_v4i32_f128(fp128* %p, <4 x i32>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: st1 { v{{[0-9]+}}.4s }
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = bitcast fp128 %2 to <4 x i32>
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_v2f64:
+define void @test_v4i32_v2f64(<2 x double>* %p, <4 x i32>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.4s }
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = bitcast <2 x double> %2 to <4 x i32>
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_v2i64:
+define void @test_v4i32_v2i64(<2 x i64>* %p, <4 x i32>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.4s }
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = bitcast <2 x i64> %2 to <4 x i32>
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_v4f32:
+define void @test_v4i32_v4f32(<4 x float>* %p, <4 x i32>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: st1 { v{{[0-9]+}}.4s }
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = bitcast <4 x float> %2 to <4 x i32>
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_v8i16:
+define void @test_v4i32_v8i16(<8 x i16>* %p, <4 x i32>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8h }
+; CHECK: rev32 v{{[0-9]+}}.8h
+; CHECK: st1 { v{{[0-9]+}}.4s }
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = bitcast <8 x i16> %2 to <4 x i32>
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_v16i8:
+define void @test_v4i32_v16i8(<16 x i8>* %p, <4 x i32>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.16b }
+; CHECK: rev32 v{{[0-9]+}}.16b
+; CHECK: st1 { v{{[0-9]+}}.4s }
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = bitcast <16 x i8> %2 to <4 x i32>
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_f128:
+define void @test_v8i16_f128(fp128* %p, <8 x i16>* %q) {
+; CHECK: ldr
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: st1 { v{{[0-9]+}}.8h }
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = bitcast fp128 %2 to <8 x i16>
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_v2f64:
+define void @test_v8i16_v2f64(<2 x double>* %p, <8 x i16>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: st1 { v{{[0-9]+}}.8h }
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = bitcast <2 x double> %2 to <8 x i16>
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_v2i64:
+define void @test_v8i16_v2i64(<2 x i64>* %p, <8 x i16>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: st1 { v{{[0-9]+}}.8h }
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = bitcast <2 x i64> %2 to <8 x i16>
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_v4f32:
+define void @test_v8i16_v4f32(<4 x float>* %p, <8 x i16>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: rev32 v{{[0-9]+}}.8h
+; CHECK: st1 { v{{[0-9]+}}.8h }
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = bitcast <4 x float> %2 to <8 x i16>
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_v4i32:
+define void @test_v8i16_v4i32(<4 x i32>* %p, <8 x i16>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4s }
+; CHECK: rev32 v{{[0-9]+}}.8h
+; CHECK: st1 { v{{[0-9]+}}.8h }
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = bitcast <4 x i32> %2 to <8 x i16>
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_v16i8:
+define void @test_v8i16_v16i8(<16 x i8>* %p, <8 x i16>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.16b }
+; CHECK: rev16 v{{[0-9]+}}.16b
+; CHECK: st1 { v{{[0-9]+}}.8h }
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = bitcast <16 x i8> %2 to <8 x i16>
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_f128:
+define void @test_v16i8_f128(fp128* %p, <16 x i8>* %q) {
+; CHECK: ldr q
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: st1 { v{{[0-9]+}}.16b }
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = bitcast fp128 %2 to <16 x i8>
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_v2f64:
+define void @test_v16i8_v2f64(<2 x double>* %p, <16 x i8>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: st1 { v{{[0-9]+}}.16b }
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = bitcast <2 x double> %2 to <16 x i8>
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_v2i64:
+define void @test_v16i8_v2i64(<2 x i64>* %p, <16 x i8>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: st1 { v{{[0-9]+}}.16b }
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = bitcast <2 x i64> %2 to <16 x i8>
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_v4f32:
+define void @test_v16i8_v4f32(<4 x float>* %p, <16 x i8>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.2d }
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: rev32 v{{[0-9]+}}.16b
+; CHECK: st1 { v{{[0-9]+}}.16b }
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = bitcast <4 x float> %2 to <16 x i8>
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_v4i32:
+define void @test_v16i8_v4i32(<4 x i32>* %p, <16 x i8>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.4s }
+; CHECK: rev32 v{{[0-9]+}}.16b
+; CHECK: st1 { v{{[0-9]+}}.16b }
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = bitcast <4 x i32> %2 to <16 x i8>
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_v8i16:
+define void @test_v16i8_v8i16(<8 x i16>* %p, <16 x i8>* %q) {
+; CHECK: ld1 { v{{[0-9]+}}.8h }
+; CHECK: rev16 v{{[0-9]+}}.16b
+; CHECK: st1 { v{{[0-9]+}}.16b }
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = bitcast <8 x i16> %2 to <16 x i8>
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-big-endian-eh.ll b/test/CodeGen/AArch64/arm64-big-endian-eh.ll
new file mode 100644
index 0000000..93e7da9
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-big-endian-eh.ll
@@ -0,0 +1,73 @@
+; RUN: llc -mtriple arm64_be-linux-gnu -filetype obj < %s | llvm-objdump -s - | FileCheck %s
+
+; ARM EHABI for big endian
+; This test case checks that the CIE length record is laid out in big-endian format.
+;
+; This is the LLVM assembly generated from following C++ code:
+;
+; extern void foo(int);
+; void test(int a, int b) {
+; try {
+; foo(a);
+; } catch (...) {
+; foo(b);
+; }
+; }
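+;
+; With a big-endian target, the 4-byte CIE length field (0x1c here) is expected to appear
+; in .eh_frame as the byte sequence 00 00 00 1c rather than 1c 00 00 00; the CHECK lines
+; at the end of this file verify exactly that.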
+
+define void @_Z4testii(i32 %a, i32 %b) #0 {
+entry:
+ invoke void @_Z3fooi(i32 %a)
+ to label %try.cont unwind label %lpad
+
+lpad: ; preds = %entry
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* null
+ %1 = extractvalue { i8*, i32 } %0, 0
+ %2 = tail call i8* @__cxa_begin_catch(i8* %1) #2
+ invoke void @_Z3fooi(i32 %b)
+ to label %invoke.cont2 unwind label %lpad1
+
+invoke.cont2: ; preds = %lpad
+ tail call void @__cxa_end_catch()
+ br label %try.cont
+
+try.cont: ; preds = %entry, %invoke.cont2
+ ret void
+
+lpad1: ; preds = %lpad
+ %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ invoke void @__cxa_end_catch()
+ to label %eh.resume unwind label %terminate.lpad
+
+eh.resume: ; preds = %lpad1
+ resume { i8*, i32 } %3
+
+terminate.lpad: ; preds = %lpad1
+ %4 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* null
+ %5 = extractvalue { i8*, i32 } %4, 0
+ tail call void @__clang_call_terminate(i8* %5) #3
+ unreachable
+}
+
+declare void @_Z3fooi(i32) #0
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+; Function Attrs: noinline noreturn nounwind
+define linkonce_odr hidden void @__clang_call_terminate(i8*) #1 {
+ %2 = tail call i8* @__cxa_begin_catch(i8* %0) #2
+ tail call void @_ZSt9terminatev() #3
+ unreachable
+}
+
+declare void @_ZSt9terminatev()
+
+; CHECK-LABEL: Contents of section .eh_frame:
+; CHECK-NEXT: 0000 0000001c
+
diff --git a/test/CodeGen/AArch64/arm64-big-endian-varargs.ll b/test/CodeGen/AArch64/arm64-big-endian-varargs.ll
new file mode 100644
index 0000000..d7b26b9
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-big-endian-varargs.ll
@@ -0,0 +1,58 @@
+; RUN: llc < %s | FileCheck %s
+
+; Vararg saving must save Q registers using the equivalent of STR/STP.
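+;
+; The IR below is roughly what a C variadic callee along these lines would lower to
+; (hypothetical source, shown only for orientation):
+;
+;   double callee(int a, ...) {
+;     va_list vl;
+;     va_start(vl, a);
+;     double d = va_arg(vl, double);
+;     va_end(vl);
+;     return d;
+;   }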
+
+target datalayout = "E-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "arm64_be-arm-none-eabi"
+
+%struct.__va_list = type { i8*, i8*, i8*, i32, i32 }
+
+declare void @llvm.va_start(i8*) nounwind
+declare void @llvm.va_end(i8*) nounwind
+
+define double @callee(i32 %a, ...) {
+; CHECK: stp
+; CHECK: stp
+; CHECK: stp
+; CHECK: stp
+; CHECK: stp
+; CHECK: stp
+entry:
+ %vl = alloca %struct.__va_list, align 8
+ %vl1 = bitcast %struct.__va_list* %vl to i8*
+ call void @llvm.va_start(i8* %vl1)
+ %vr_offs_p = getelementptr inbounds %struct.__va_list* %vl, i64 0, i32 4
+ %vr_offs = load i32* %vr_offs_p, align 4
+ %0 = icmp sgt i32 %vr_offs, -1
+ br i1 %0, label %vaarg.on_stack, label %vaarg.maybe_reg
+
+vaarg.maybe_reg: ; preds = %entry
+ %new_reg_offs = add i32 %vr_offs, 16
+ store i32 %new_reg_offs, i32* %vr_offs_p, align 4
+ %inreg = icmp slt i32 %new_reg_offs, 1
+ br i1 %inreg, label %vaarg.in_reg, label %vaarg.on_stack
+
+vaarg.in_reg: ; preds = %vaarg.maybe_reg
+ %reg_top_p = getelementptr inbounds %struct.__va_list* %vl, i64 0, i32 2
+ %reg_top = load i8** %reg_top_p, align 8
+ %1 = sext i32 %vr_offs to i64
+ %2 = getelementptr i8* %reg_top, i64 %1
+ %3 = ptrtoint i8* %2 to i64
+ %align_be = add i64 %3, 8
+ %4 = inttoptr i64 %align_be to i8*
+ br label %vaarg.end
+
+vaarg.on_stack: ; preds = %vaarg.maybe_reg, %entry
+ %stack_p = getelementptr inbounds %struct.__va_list* %vl, i64 0, i32 0
+ %stack = load i8** %stack_p, align 8
+ %new_stack = getelementptr i8* %stack, i64 8
+ store i8* %new_stack, i8** %stack_p, align 8
+ br label %vaarg.end
+
+vaarg.end: ; preds = %vaarg.on_stack, %vaarg.in_reg
+ %.sink = phi i8* [ %4, %vaarg.in_reg ], [ %stack, %vaarg.on_stack ]
+ %5 = bitcast i8* %.sink to double*
+ %6 = load double* %5, align 8
+ call void @llvm.va_end(i8* %vl1)
+ ret double %6
+}
diff --git a/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll
new file mode 100644
index 0000000..1dcccf1
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll
@@ -0,0 +1,848 @@
+; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
+; RUN: llc -mtriple arm64_be < %s -fast-isel=true -aarch64-load-store-opt=false -o - | FileCheck %s
+
+; CHECK-LABEL: test_i64_f64:
+define i64 @test_i64_f64(double %p) {
+; CHECK-NOT: rev
+ %1 = fadd double %p, %p
+ %2 = bitcast double %1 to i64
+ %3 = add i64 %2, %2
+ ret i64 %3
+}
+
+; CHECK-LABEL: test_i64_v1i64:
+define i64 @test_i64_v1i64(<1 x i64> %p) {
+; CHECK-NOT: rev
+ %1 = add <1 x i64> %p, %p
+ %2 = bitcast <1 x i64> %1 to i64
+ %3 = add i64 %2, %2
+ ret i64 %3
+}
+
+; CHECK-LABEL: test_i64_v2f32:
+define i64 @test_i64_v2f32(<2 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = fadd <2 x float> %p, %p
+ %2 = bitcast <2 x float> %1 to i64
+ %3 = add i64 %2, %2
+ ret i64 %3
+}
+
+; CHECK-LABEL: test_i64_v2i32:
+define i64 @test_i64_v2i32(<2 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add <2 x i32> %p, %p
+ %2 = bitcast <2 x i32> %1 to i64
+ %3 = add i64 %2, %2
+ ret i64 %3
+}
+
+; CHECK-LABEL: test_i64_v4i16:
+define i64 @test_i64_v4i16(<4 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = add <4 x i16> %p, %p
+ %2 = bitcast <4 x i16> %1 to i64
+ %3 = add i64 %2, %2
+ ret i64 %3
+}
+
+; CHECK-LABEL: test_i64_v8i8:
+define i64 @test_i64_v8i8(<8 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = add <8 x i8> %p, %p
+ %2 = bitcast <8 x i8> %1 to i64
+ %3 = add i64 %2, %2
+ ret i64 %3
+}
+
+; CHECK-LABEL: test_f64_i64:
+define double @test_f64_i64(i64 %p) {
+; CHECK-NOT: rev
+ %1 = add i64 %p, %p
+ %2 = bitcast i64 %1 to double
+ %3 = fadd double %2, %2
+ ret double %3
+}
+
+; CHECK-LABEL: test_f64_v1i64:
+define double @test_f64_v1i64(<1 x i64> %p) {
+; CHECK-NOT: rev
+ %1 = add <1 x i64> %p, %p
+ %2 = bitcast <1 x i64> %1 to double
+ %3 = fadd double %2, %2
+ ret double %3
+}
+
+; CHECK-LABEL: test_f64_v2f32:
+define double @test_f64_v2f32(<2 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = fadd <2 x float> %p, %p
+ %2 = bitcast <2 x float> %1 to double
+ %3 = fadd double %2, %2
+ ret double %3
+}
+
+; CHECK-LABEL: test_f64_v2i32:
+define double @test_f64_v2i32(<2 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add <2 x i32> %p, %p
+ %2 = bitcast <2 x i32> %1 to double
+ %3 = fadd double %2, %2
+ ret double %3
+}
+
+; CHECK-LABEL: test_f64_v4i16:
+define double @test_f64_v4i16(<4 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = add <4 x i16> %p, %p
+ %2 = bitcast <4 x i16> %1 to double
+ %3 = fadd double %2, %2
+ ret double %3
+}
+
+; CHECK-LABEL: test_f64_v8i8:
+define double @test_f64_v8i8(<8 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = add <8 x i8> %p, %p
+ %2 = bitcast <8 x i8> %1 to double
+ %3 = fadd double %2, %2
+ ret double %3
+}
+
+; CHECK-LABEL: test_v1i64_i64:
+define <1 x i64> @test_v1i64_i64(i64 %p) {
+; CHECK-NOT: rev
+ %1 = add i64 %p, %p
+ %2 = bitcast i64 %1 to <1 x i64>
+ %3 = add <1 x i64> %2, %2
+ ret <1 x i64> %3
+}
+
+; CHECK-LABEL: test_v1i64_f64:
+define <1 x i64> @test_v1i64_f64(double %p) {
+; CHECK-NOT: rev
+ %1 = fadd double %p, %p
+ %2 = bitcast double %1 to <1 x i64>
+ %3 = add <1 x i64> %2, %2
+ ret <1 x i64> %3
+}
+
+; CHECK-LABEL: test_v1i64_v2f32:
+define <1 x i64> @test_v1i64_v2f32(<2 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = fadd <2 x float> %p, %p
+ %2 = bitcast <2 x float> %1 to <1 x i64>
+ %3 = add <1 x i64> %2, %2
+ ret <1 x i64> %3
+}
+
+; CHECK-LABEL: test_v1i64_v2i32:
+define <1 x i64> @test_v1i64_v2i32(<2 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add <2 x i32> %p, %p
+ %2 = bitcast <2 x i32> %1 to <1 x i64>
+ %3 = add <1 x i64> %2, %2
+ ret <1 x i64> %3
+}
+
+; CHECK-LABEL: test_v1i64_v4i16:
+define <1 x i64> @test_v1i64_v4i16(<4 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = add <4 x i16> %p, %p
+ %2 = bitcast <4 x i16> %1 to <1 x i64>
+ %3 = add <1 x i64> %2, %2
+ ret <1 x i64> %3
+}
+
+; CHECK-LABEL: test_v1i64_v8i8:
+define <1 x i64> @test_v1i64_v8i8(<8 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = add <8 x i8> %p, %p
+ %2 = bitcast <8 x i8> %1 to <1 x i64>
+ %3 = add <1 x i64> %2, %2
+ ret <1 x i64> %3
+}
+
+; CHECK-LABEL: test_v2f32_i64:
+define <2 x float> @test_v2f32_i64(i64 %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add i64 %p, %p
+ %2 = bitcast i64 %1 to <2 x float>
+ %3 = fadd <2 x float> %2, %2
+ ret <2 x float> %3
+}
+
+; CHECK-LABEL: test_v2f32_f64:
+define <2 x float> @test_v2f32_f64(double %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = fadd double %p, %p
+ %2 = bitcast double %1 to <2 x float>
+ %3 = fadd <2 x float> %2, %2
+ ret <2 x float> %3
+}
+
+; CHECK-LABEL: test_v2f32_v1i64:
+define <2 x float> @test_v2f32_v1i64(<1 x i64> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add <1 x i64> %p, %p
+ %2 = bitcast <1 x i64> %1 to <2 x float>
+ %3 = fadd <2 x float> %2, %2
+ ret <2 x float> %3
+}
+
+; CHECK-LABEL: test_v2f32_v2i32:
+define <2 x float> @test_v2f32_v2i32(<2 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add <2 x i32> %p, %p
+ %2 = bitcast <2 x i32> %1 to <2 x float>
+ %3 = fadd <2 x float> %2, %2
+ ret <2 x float> %3
+}
+
+; CHECK-LABEL: test_v2f32_v4i16:
+define <2 x float> @test_v2f32_v4i16(<4 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add <4 x i16> %p, %p
+ %2 = bitcast <4 x i16> %1 to <2 x float>
+ %3 = fadd <2 x float> %2, %2
+ ret <2 x float> %3
+}
+
+; CHECK-LABEL: test_v2f32_v8i8:
+define <2 x float> @test_v2f32_v8i8(<8 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add <8 x i8> %p, %p
+ %2 = bitcast <8 x i8> %1 to <2 x float>
+ %3 = fadd <2 x float> %2, %2
+ ret <2 x float> %3
+}
+
+; CHECK-LABEL: test_v2i32_i64:
+define <2 x i32> @test_v2i32_i64(i64 %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add i64 %p, %p
+ %2 = bitcast i64 %1 to <2 x i32>
+ %3 = add <2 x i32> %2, %2
+ ret <2 x i32> %3
+}
+
+; CHECK-LABEL: test_v2i32_f64:
+define <2 x i32> @test_v2i32_f64(double %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = fadd double %p, %p
+ %2 = bitcast double %1 to <2 x i32>
+ %3 = add <2 x i32> %2, %2
+ ret <2 x i32> %3
+}
+
+; CHECK-LABEL: test_v2i32_v1i64:
+define <2 x i32> @test_v2i32_v1i64(<1 x i64> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add <1 x i64> %p, %p
+ %2 = bitcast <1 x i64> %1 to <2 x i32>
+ %3 = add <2 x i32> %2, %2
+ ret <2 x i32> %3
+}
+
+; CHECK-LABEL: test_v2i32_v2f32:
+define <2 x i32> @test_v2i32_v2f32(<2 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = fadd <2 x float> %p, %p
+ %2 = bitcast <2 x float> %1 to <2 x i32>
+ %3 = add <2 x i32> %2, %2
+ ret <2 x i32> %3
+}
+
+; CHECK-LABEL: test_v2i32_v4i16:
+define <2 x i32> @test_v2i32_v4i16(<4 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add <4 x i16> %p, %p
+ %2 = bitcast <4 x i16> %1 to <2 x i32>
+ %3 = add <2 x i32> %2, %2
+ ret <2 x i32> %3
+}
+
+; CHECK-LABEL: test_v2i32_v8i8:
+define <2 x i32> @test_v2i32_v8i8(<8 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = add <8 x i8> %p, %p
+ %2 = bitcast <8 x i8> %1 to <2 x i32>
+ %3 = add <2 x i32> %2, %2
+ ret <2 x i32> %3
+}
+
+; CHECK-LABEL: test_v4i16_i64:
+define <4 x i16> @test_v4i16_i64(i64 %p) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = add i64 %p, %p
+ %2 = bitcast i64 %1 to <4 x i16>
+ %3 = add <4 x i16> %2, %2
+ ret <4 x i16> %3
+}
+
+; CHECK-LABEL: test_v4i16_f64:
+define <4 x i16> @test_v4i16_f64(double %p) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = fadd double %p, %p
+ %2 = bitcast double %1 to <4 x i16>
+ %3 = add <4 x i16> %2, %2
+ ret <4 x i16> %3
+}
+
+; CHECK-LABEL: test_v4i16_v1i64:
+define <4 x i16> @test_v4i16_v1i64(<1 x i64> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = add <1 x i64> %p, %p
+ %2 = bitcast <1 x i64> %1 to <4 x i16>
+ %3 = add <4 x i16> %2, %2
+ ret <4 x i16> %3
+}
+
+; CHECK-LABEL: test_v4i16_v2f32:
+define <4 x i16> @test_v4i16_v2f32(<2 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = fadd <2 x float> %p, %p
+ %2 = bitcast <2 x float> %1 to <4 x i16>
+ %3 = add <4 x i16> %2, %2
+ ret <4 x i16> %3
+}
+
+; CHECK-LABEL: test_v4i16_v2i32:
+define <4 x i16> @test_v4i16_v2i32(<2 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = add <2 x i32> %p, %p
+ %2 = bitcast <2 x i32> %1 to <4 x i16>
+ %3 = add <4 x i16> %2, %2
+ ret <4 x i16> %3
+}
+
+; CHECK-LABEL: test_v4i16_v8i8:
+define <4 x i16> @test_v4i16_v8i8(<8 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = add <8 x i8> %p, %p
+ %2 = bitcast <8 x i8> %1 to <4 x i16>
+ %3 = add <4 x i16> %2, %2
+ ret <4 x i16> %3
+}
+
+; CHECK-LABEL: test_v8i8_i64:
+define <8 x i8> @test_v8i8_i64(i64 %p) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = add i64 %p, %p
+ %2 = bitcast i64 %1 to <8 x i8>
+ %3 = add <8 x i8> %2, %2
+ ret <8 x i8> %3
+}
+
+; CHECK-LABEL: test_v8i8_f64:
+define <8 x i8> @test_v8i8_f64(double %p) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = fadd double %p, %p
+ %2 = bitcast double %1 to <8 x i8>
+ %3 = add <8 x i8> %2, %2
+ ret <8 x i8> %3
+}
+
+; CHECK-LABEL: test_v8i8_v1i64:
+define <8 x i8> @test_v8i8_v1i64(<1 x i64> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = add <1 x i64> %p, %p
+ %2 = bitcast <1 x i64> %1 to <8 x i8>
+ %3 = add <8 x i8> %2, %2
+ ret <8 x i8> %3
+}
+
+; CHECK-LABEL: test_v8i8_v2f32:
+define <8 x i8> @test_v8i8_v2f32(<2 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = fadd <2 x float> %p, %p
+ %2 = bitcast <2 x float> %1 to <8 x i8>
+ %3 = add <8 x i8> %2, %2
+ ret <8 x i8> %3
+}
+
+; CHECK-LABEL: test_v8i8_v2i32:
+define <8 x i8> @test_v8i8_v2i32(<2 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = add <2 x i32> %p, %p
+ %2 = bitcast <2 x i32> %1 to <8 x i8>
+ %3 = add <8 x i8> %2, %2
+ ret <8 x i8> %3
+}
+
+; CHECK-LABEL: test_v8i8_v4i16:
+define <8 x i8> @test_v8i8_v4i16(<4 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = add <4 x i16> %p, %p
+ %2 = bitcast <4 x i16> %1 to <8 x i8>
+ %3 = add <8 x i8> %2, %2
+ ret <8 x i8> %3
+}
+
+; CHECK-LABEL: test_f128_v2f64:
+define fp128 @test_f128_v2f64(<2 x double> %p) {
+; CHECK: ext
+ %1 = fadd <2 x double> %p, %p
+ %2 = bitcast <2 x double> %1 to fp128
+ %3 = fadd fp128 %2, %2
+ ret fp128 %3
+}
+
+; CHECK-LABEL: test_f128_v2i64:
+define fp128 @test_f128_v2i64(<2 x i64> %p) {
+; CHECK: ext
+ %1 = add <2 x i64> %p, %p
+ %2 = bitcast <2 x i64> %1 to fp128
+ %3 = fadd fp128 %2, %2
+ ret fp128 %3
+}
+
+; CHECK-LABEL: test_f128_v4f32:
+define fp128 @test_f128_v4f32(<4 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = fadd <4 x float> %p, %p
+ %2 = bitcast <4 x float> %1 to fp128
+ %3 = fadd fp128 %2, %2
+ ret fp128 %3
+}
+
+; CHECK-LABEL: test_f128_v4i32:
+define fp128 @test_f128_v4i32(<4 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = add <4 x i32> %p, %p
+ %2 = bitcast <4 x i32> %1 to fp128
+ %3 = fadd fp128 %2, %2
+ ret fp128 %3
+}
+
+; CHECK-LABEL: test_f128_v8i16:
+define fp128 @test_f128_v8i16(<8 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = add <8 x i16> %p, %p
+ %2 = bitcast <8 x i16> %1 to fp128
+ %3 = fadd fp128 %2, %2
+ ret fp128 %3
+}
+
+; CHECK-LABEL: test_f128_v16i8:
+define fp128 @test_f128_v16i8(<16 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = add <16 x i8> %p, %p
+ %2 = bitcast <16 x i8> %1 to fp128
+ %3 = fadd fp128 %2, %2
+ ret fp128 %3
+}
+
+; CHECK-LABEL: test_v2f64_f128:
+define <2 x double> @test_v2f64_f128(fp128 %p) {
+; CHECK: ext
+ %1 = fadd fp128 %p, %p
+ %2 = bitcast fp128 %1 to <2 x double>
+ %3 = fadd <2 x double> %2, %2
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_v2f64_v2i64:
+define <2 x double> @test_v2f64_v2i64(<2 x i64> %p) {
+; CHECK: ext
+; CHECK: ext
+ %1 = add <2 x i64> %p, %p
+ %2 = bitcast <2 x i64> %1 to <2 x double>
+ %3 = fadd <2 x double> %2, %2
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_v2f64_v4f32:
+define <2 x double> @test_v2f64_v4f32(<4 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: ext
+ %1 = fadd <4 x float> %p, %p
+ %2 = bitcast <4 x float> %1 to <2 x double>
+ %3 = fadd <2 x double> %2, %2
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_v2f64_v4i32:
+define <2 x double> @test_v2f64_v4i32(<4 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: ext
+ %1 = add <4 x i32> %p, %p
+ %2 = bitcast <4 x i32> %1 to <2 x double>
+ %3 = fadd <2 x double> %2, %2
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_v2f64_v8i16:
+define <2 x double> @test_v2f64_v8i16(<8 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: ext
+ %1 = add <8 x i16> %p, %p
+ %2 = bitcast <8 x i16> %1 to <2 x double>
+ %3 = fadd <2 x double> %2, %2
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_v2f64_v16i8:
+define <2 x double> @test_v2f64_v16i8(<16 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: ext
+ %1 = add <16 x i8> %p, %p
+ %2 = bitcast <16 x i8> %1 to <2 x double>
+ %3 = fadd <2 x double> %2, %2
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_v2i64_f128:
+define <2 x i64> @test_v2i64_f128(fp128 %p) {
+; CHECK: ext
+ %1 = fadd fp128 %p, %p
+ %2 = bitcast fp128 %1 to <2 x i64>
+ %3 = add <2 x i64> %2, %2
+ ret <2 x i64> %3
+}
+
+; CHECK-LABEL: test_v2i64_v2f64:
+define <2 x i64> @test_v2i64_v2f64(<2 x double> %p) {
+; CHECK: ext
+; CHECK: ext
+ %1 = fadd <2 x double> %p, %p
+ %2 = bitcast <2 x double> %1 to <2 x i64>
+ %3 = add <2 x i64> %2, %2
+ ret <2 x i64> %3
+}
+
+; CHECK-LABEL: test_v2i64_v4f32:
+define <2 x i64> @test_v2i64_v4f32(<4 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: ext
+ %1 = fadd <4 x float> %p, %p
+ %2 = bitcast <4 x float> %1 to <2 x i64>
+ %3 = add <2 x i64> %2, %2
+ ret <2 x i64> %3
+}
+
+; CHECK-LABEL: test_v2i64_v4i32:
+define <2 x i64> @test_v2i64_v4i32(<4 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: ext
+ %1 = add <4 x i32> %p, %p
+ %2 = bitcast <4 x i32> %1 to <2 x i64>
+ %3 = add <2 x i64> %2, %2
+ ret <2 x i64> %3
+}
+
+; CHECK-LABEL: test_v2i64_v8i16:
+define <2 x i64> @test_v2i64_v8i16(<8 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: ext
+ %1 = add <8 x i16> %p, %p
+ %2 = bitcast <8 x i16> %1 to <2 x i64>
+ %3 = add <2 x i64> %2, %2
+ ret <2 x i64> %3
+}
+
+; CHECK-LABEL: test_v2i64_v16i8:
+define <2 x i64> @test_v2i64_v16i8(<16 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: ext
+ %1 = add <16 x i8> %p, %p
+ %2 = bitcast <16 x i8> %1 to <2 x i64>
+ %3 = add <2 x i64> %2, %2
+ ret <2 x i64> %3
+}
+
+; CHECK-LABEL: test_v4f32_f128:
+define <4 x float> @test_v4f32_f128(fp128 %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = fadd fp128 %p, %p
+ %2 = bitcast fp128 %1 to <4 x float>
+ %3 = fadd <4 x float> %2, %2
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_v4f32_v2f64:
+define <4 x float> @test_v4f32_v2f64(<2 x double> %p) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = fadd <2 x double> %p, %p
+ %2 = bitcast <2 x double> %1 to <4 x float>
+ %3 = fadd <4 x float> %2, %2
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_v4f32_v2i64:
+define <4 x float> @test_v4f32_v2i64(<2 x i64> %p) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = add <2 x i64> %p, %p
+ %2 = bitcast <2 x i64> %1 to <4 x float>
+ %3 = fadd <4 x float> %2, %2
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_v4f32_v4i32:
+define <4 x float> @test_v4f32_v4i32(<4 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = add <4 x i32> %p, %p
+ %2 = bitcast <4 x i32> %1 to <4 x float>
+ %3 = fadd <4 x float> %2, %2
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_v4f32_v8i16:
+define <4 x float> @test_v4f32_v8i16(<8 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = add <8 x i16> %p, %p
+ %2 = bitcast <8 x i16> %1 to <4 x float>
+ %3 = fadd <4 x float> %2, %2
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_v4f32_v16i8:
+define <4 x float> @test_v4f32_v16i8(<16 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = add <16 x i8> %p, %p
+ %2 = bitcast <16 x i8> %1 to <4 x float>
+ %3 = fadd <4 x float> %2, %2
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_v4i32_f128:
+define <4 x i32> @test_v4i32_f128(fp128 %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = fadd fp128 %p, %p
+ %2 = bitcast fp128 %1 to <4 x i32>
+ %3 = add <4 x i32> %2, %2
+ ret <4 x i32> %3
+}
+
+; CHECK-LABEL: test_v4i32_v2f64:
+define <4 x i32> @test_v4i32_v2f64(<2 x double> %p) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = fadd <2 x double> %p, %p
+ %2 = bitcast <2 x double> %1 to <4 x i32>
+ %3 = add <4 x i32> %2, %2
+ ret <4 x i32> %3
+}
+
+; CHECK-LABEL: test_v4i32_v2i64:
+define <4 x i32> @test_v4i32_v2i64(<2 x i64> %p) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = add <2 x i64> %p, %p
+ %2 = bitcast <2 x i64> %1 to <4 x i32>
+ %3 = add <4 x i32> %2, %2
+ ret <4 x i32> %3
+}
+
+; CHECK-LABEL: test_v4i32_v4f32:
+define <4 x i32> @test_v4i32_v4f32(<4 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = fadd <4 x float> %p, %p
+ %2 = bitcast <4 x float> %1 to <4 x i32>
+ %3 = add <4 x i32> %2, %2
+ ret <4 x i32> %3
+}
+
+; CHECK-LABEL: test_v4i32_v8i16:
+define <4 x i32> @test_v4i32_v8i16(<8 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = add <8 x i16> %p, %p
+ %2 = bitcast <8 x i16> %1 to <4 x i32>
+ %3 = add <4 x i32> %2, %2
+ ret <4 x i32> %3
+}
+
+; CHECK-LABEL: test_v4i32_v16i8:
+define <4 x i32> @test_v4i32_v16i8(<16 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = add <16 x i8> %p, %p
+ %2 = bitcast <16 x i8> %1 to <4 x i32>
+ %3 = add <4 x i32> %2, %2
+ ret <4 x i32> %3
+}
+
+; CHECK-LABEL: test_v8i16_f128:
+define <8 x i16> @test_v8i16_f128(fp128 %p) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = fadd fp128 %p, %p
+ %2 = bitcast fp128 %1 to <8 x i16>
+ %3 = add <8 x i16> %2, %2
+ ret <8 x i16> %3
+}
+
+; CHECK-LABEL: test_v8i16_v2f64:
+define <8 x i16> @test_v8i16_v2f64(<2 x double> %p) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = fadd <2 x double> %p, %p
+ %2 = bitcast <2 x double> %1 to <8 x i16>
+ %3 = add <8 x i16> %2, %2
+ ret <8 x i16> %3
+}
+
+; CHECK-LABEL: test_v8i16_v2i64:
+define <8 x i16> @test_v8i16_v2i64(<2 x i64> %p) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = add <2 x i64> %p, %p
+ %2 = bitcast <2 x i64> %1 to <8 x i16>
+ %3 = add <8 x i16> %2, %2
+ ret <8 x i16> %3
+}
+
+; CHECK-LABEL: test_v8i16_v4f32:
+define <8 x i16> @test_v8i16_v4f32(<4 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = fadd <4 x float> %p, %p
+ %2 = bitcast <4 x float> %1 to <8 x i16>
+ %3 = add <8 x i16> %2, %2
+ ret <8 x i16> %3
+}
+
+; CHECK-LABEL: test_v8i16_v4i32:
+define <8 x i16> @test_v8i16_v4i32(<4 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = add <4 x i32> %p, %p
+ %2 = bitcast <4 x i32> %1 to <8 x i16>
+ %3 = add <8 x i16> %2, %2
+ ret <8 x i16> %3
+}
+
+; CHECK-LABEL: test_v8i16_v16i8:
+define <8 x i16> @test_v8i16_v16i8(<16 x i8> %p) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = add <16 x i8> %p, %p
+ %2 = bitcast <16 x i8> %1 to <8 x i16>
+ %3 = add <8 x i16> %2, %2
+ ret <8 x i16> %3
+}
+
+; CHECK-LABEL: test_v16i8_f128:
+define <16 x i8> @test_v16i8_f128(fp128 %p) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = fadd fp128 %p, %p
+ %2 = bitcast fp128 %1 to <16 x i8>
+ %3 = add <16 x i8> %2, %2
+ ret <16 x i8> %3
+}
+
+; CHECK-LABEL: test_v16i8_v2f64:
+define <16 x i8> @test_v16i8_v2f64(<2 x double> %p) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = fadd <2 x double> %p, %p
+ %2 = bitcast <2 x double> %1 to <16 x i8>
+ %3 = add <16 x i8> %2, %2
+ ret <16 x i8> %3
+}
+
+; CHECK-LABEL: test_v16i8_v2i64:
+define <16 x i8> @test_v16i8_v2i64(<2 x i64> %p) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = add <2 x i64> %p, %p
+ %2 = bitcast <2 x i64> %1 to <16 x i8>
+ %3 = add <16 x i8> %2, %2
+ ret <16 x i8> %3
+}
+
+; CHECK-LABEL: test_v16i8_v4f32:
+define <16 x i8> @test_v16i8_v4f32(<4 x float> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = fadd <4 x float> %p, %p
+ %2 = bitcast <4 x float> %1 to <16 x i8>
+ %3 = add <16 x i8> %2, %2
+ ret <16 x i8> %3
+}
+
+; CHECK-LABEL: test_v16i8_v4i32:
+define <16 x i8> @test_v16i8_v4i32(<4 x i32> %p) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = add <4 x i32> %p, %p
+ %2 = bitcast <4 x i32> %1 to <16 x i8>
+ %3 = add <16 x i8> %2, %2
+ ret <16 x i8> %3
+}
+
+; CHECK-LABEL: test_v16i8_v8i16:
+define <16 x i8> @test_v16i8_v8i16(<8 x i16> %p) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = add <8 x i16> %p, %p
+ %2 = bitcast <8 x i16> %1 to <16 x i8>
+ %3 = add <16 x i8> %2, %2
+ ret <16 x i8> %3
+}
diff --git a/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
new file mode 100644
index 0000000..9a12b7a
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
@@ -0,0 +1,1100 @@
+; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
+; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -fast-isel=true -O0 -o - | FileCheck %s
+
+; CHECK-LABEL: test_i64_f64:
+declare i64 @test_i64_f64_helper(double %p)
+define void @test_i64_f64(double* %p, i64* %q) {
+; CHECK-NOT: rev
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = call i64 @test_i64_f64_helper(double %2)
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_i64_v1i64:
+declare i64 @test_i64_v1i64_helper(<1 x i64> %p)
+define void @test_i64_v1i64(<1 x i64>* %p, i64* %q) {
+; CHECK-NOT: rev
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = call i64 @test_i64_v1i64_helper(<1 x i64> %2)
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_i64_v2f32:
+declare i64 @test_i64_v2f32_helper(<2 x float> %p)
+define void @test_i64_v2f32(<2 x float>* %p, i64* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = call i64 @test_i64_v2f32_helper(<2 x float> %2)
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_i64_v2i32:
+declare i64 @test_i64_v2i32_helper(<2 x i32> %p)
+define void @test_i64_v2i32(<2 x i32>* %p, i64* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = call i64 @test_i64_v2i32_helper(<2 x i32> %2)
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_i64_v4i16:
+declare i64 @test_i64_v4i16_helper(<4 x i16> %p)
+define void @test_i64_v4i16(<4 x i16>* %p, i64* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = call i64 @test_i64_v4i16_helper(<4 x i16> %2)
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_i64_v8i8:
+declare i64 @test_i64_v8i8_helper(<8 x i8> %p)
+define void @test_i64_v8i8(<8 x i8>* %p, i64* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = call i64 @test_i64_v8i8_helper(<8 x i8> %2)
+ %4 = add i64 %3, %3
+ store i64 %4, i64* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_i64:
+declare double @test_f64_i64_helper(i64 %p)
+define void @test_f64_i64(i64* %p, double* %q) {
+; CHECK-NOT: rev
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = call double @test_f64_i64_helper(i64 %2)
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_v1i64:
+declare double @test_f64_v1i64_helper(<1 x i64> %p)
+define void @test_f64_v1i64(<1 x i64>* %p, double* %q) {
+; CHECK-NOT: rev
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = call double @test_f64_v1i64_helper(<1 x i64> %2)
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_v2f32:
+declare double @test_f64_v2f32_helper(<2 x float> %p)
+define void @test_f64_v2f32(<2 x float>* %p, double* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = call double @test_f64_v2f32_helper(<2 x float> %2)
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_v2i32:
+declare double @test_f64_v2i32_helper(<2 x i32> %p)
+define void @test_f64_v2i32(<2 x i32>* %p, double* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = call double @test_f64_v2i32_helper(<2 x i32> %2)
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_v4i16:
+declare double @test_f64_v4i16_helper(<4 x i16> %p)
+define void @test_f64_v4i16(<4 x i16>* %p, double* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = call double @test_f64_v4i16_helper(<4 x i16> %2)
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f64_v8i8:
+declare double @test_f64_v8i8_helper(<8 x i8> %p)
+define void @test_f64_v8i8(<8 x i8>* %p, double* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = call double @test_f64_v8i8_helper(<8 x i8> %2)
+ %4 = fadd double %3, %3
+ store double %4, double* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_i64:
+declare <1 x i64> @test_v1i64_i64_helper(i64 %p)
+define void @test_v1i64_i64(i64* %p, <1 x i64>* %q) {
+; CHECK-NOT: rev
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = call <1 x i64> @test_v1i64_i64_helper(i64 %2)
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_f64:
+declare <1 x i64> @test_v1i64_f64_helper(double %p)
+define void @test_v1i64_f64(double* %p, <1 x i64>* %q) {
+; CHECK-NOT: rev
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = call <1 x i64> @test_v1i64_f64_helper(double %2)
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_v2f32:
+declare <1 x i64> @test_v1i64_v2f32_helper(<2 x float> %p)
+define void @test_v1i64_v2f32(<2 x float>* %p, <1 x i64>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = call <1 x i64> @test_v1i64_v2f32_helper(<2 x float> %2)
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_v2i32:
+declare <1 x i64> @test_v1i64_v2i32_helper(<2 x i32> %p)
+define void @test_v1i64_v2i32(<2 x i32>* %p, <1 x i64>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = call <1 x i64> @test_v1i64_v2i32_helper(<2 x i32> %2)
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_v4i16:
+declare <1 x i64> @test_v1i64_v4i16_helper(<4 x i16> %p)
+define void @test_v1i64_v4i16(<4 x i16>* %p, <1 x i64>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = call <1 x i64> @test_v1i64_v4i16_helper(<4 x i16> %2)
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v1i64_v8i8:
+declare <1 x i64> @test_v1i64_v8i8_helper(<8 x i8> %p)
+define void @test_v1i64_v8i8(<8 x i8>* %p, <1 x i64>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = call <1 x i64> @test_v1i64_v8i8_helper(<8 x i8> %2)
+ %4 = add <1 x i64> %3, %3
+ store <1 x i64> %4, <1 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_i64:
+declare <2 x float> @test_v2f32_i64_helper(i64 %p)
+define void @test_v2f32_i64(i64* %p, <2 x float>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = call <2 x float> @test_v2f32_i64_helper(i64 %2)
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_f64:
+declare <2 x float> @test_v2f32_f64_helper(double %p)
+define void @test_v2f32_f64(double* %p, <2 x float>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = call <2 x float> @test_v2f32_f64_helper(double %2)
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_v1i64:
+declare <2 x float> @test_v2f32_v1i64_helper(<1 x i64> %p)
+define void @test_v2f32_v1i64(<1 x i64>* %p, <2 x float>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = call <2 x float> @test_v2f32_v1i64_helper(<1 x i64> %2)
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_v2i32:
+declare <2 x float> @test_v2f32_v2i32_helper(<2 x i32> %p)
+define void @test_v2f32_v2i32(<2 x i32>* %p, <2 x float>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = call <2 x float> @test_v2f32_v2i32_helper(<2 x i32> %2)
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_v4i16:
+declare <2 x float> @test_v2f32_v4i16_helper(<4 x i16> %p)
+define void @test_v2f32_v4i16(<4 x i16>* %p, <2 x float>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = call <2 x float> @test_v2f32_v4i16_helper(<4 x i16> %2)
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f32_v8i8:
+declare <2 x float> @test_v2f32_v8i8_helper(<8 x i8> %p)
+define void @test_v2f32_v8i8(<8 x i8>* %p, <2 x float>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = call <2 x float> @test_v2f32_v8i8_helper(<8 x i8> %2)
+ %4 = fadd <2 x float> %3, %3
+ store <2 x float> %4, <2 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_i64:
+declare <2 x i32> @test_v2i32_i64_helper(i64 %p)
+define void @test_v2i32_i64(i64* %p, <2 x i32>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = call <2 x i32> @test_v2i32_i64_helper(i64 %2)
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_f64:
+declare <2 x i32> @test_v2i32_f64_helper(double %p)
+define void @test_v2i32_f64(double* %p, <2 x i32>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = call <2 x i32> @test_v2i32_f64_helper(double %2)
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_v1i64:
+declare <2 x i32> @test_v2i32_v1i64_helper(<1 x i64> %p)
+define void @test_v2i32_v1i64(<1 x i64>* %p, <2 x i32>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = call <2 x i32> @test_v2i32_v1i64_helper(<1 x i64> %2)
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_v2f32:
+declare <2 x i32> @test_v2i32_v2f32_helper(<2 x float> %p)
+define void @test_v2i32_v2f32(<2 x float>* %p, <2 x i32>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = call <2 x i32> @test_v2i32_v2f32_helper(<2 x float> %2)
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_v4i16:
+declare <2 x i32> @test_v2i32_v4i16_helper(<4 x i16> %p)
+define void @test_v2i32_v4i16(<4 x i16>* %p, <2 x i32>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = call <2 x i32> @test_v2i32_v4i16_helper(<4 x i16> %2)
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i32_v8i8:
+declare <2 x i32> @test_v2i32_v8i8_helper(<8 x i8> %p)
+define void @test_v2i32_v8i8(<8 x i8>* %p, <2 x i32>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: rev64 v{{[0-9]+}}.2s
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = call <2 x i32> @test_v2i32_v8i8_helper(<8 x i8> %2)
+ %4 = add <2 x i32> %3, %3
+ store <2 x i32> %4, <2 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_i64:
+declare <4 x i16> @test_v4i16_i64_helper(i64 %p)
+define void @test_v4i16_i64(i64* %p, <4 x i16>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = call <4 x i16> @test_v4i16_i64_helper(i64 %2)
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_f64:
+declare <4 x i16> @test_v4i16_f64_helper(double %p)
+define void @test_v4i16_f64(double* %p, <4 x i16>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = call <4 x i16> @test_v4i16_f64_helper(double %2)
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_v1i64:
+declare <4 x i16> @test_v4i16_v1i64_helper(<1 x i64> %p)
+define void @test_v4i16_v1i64(<1 x i64>* %p, <4 x i16>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = call <4 x i16> @test_v4i16_v1i64_helper(<1 x i64> %2)
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_v2f32:
+declare <4 x i16> @test_v4i16_v2f32_helper(<2 x float> %p)
+define void @test_v4i16_v2f32(<2 x float>* %p, <4 x i16>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = call <4 x i16> @test_v4i16_v2f32_helper(<2 x float> %2)
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_v2i32:
+declare <4 x i16> @test_v4i16_v2i32_helper(<2 x i32> %p)
+define void @test_v4i16_v2i32(<2 x i32>* %p, <4 x i16>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = call <4 x i16> @test_v4i16_v2i32_helper(<2 x i32> %2)
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i16_v8i8:
+declare <4 x i16> @test_v4i16_v8i8_helper(<8 x i8> %p)
+define void @test_v4i16_v8i8(<8 x i8>* %p, <4 x i16>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+; CHECK: rev64 v{{[0-9]+}}.4h
+ %1 = load <8 x i8>* %p
+ %2 = add <8 x i8> %1, %1
+ %3 = call <4 x i16> @test_v4i16_v8i8_helper(<8 x i8> %2)
+ %4 = add <4 x i16> %3, %3
+ store <4 x i16> %4, <4 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_i64:
+declare <8 x i8> @test_v8i8_i64_helper(i64 %p)
+define void @test_v8i8_i64(i64* %p, <8 x i8>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = load i64* %p
+ %2 = add i64 %1, %1
+ %3 = call <8 x i8> @test_v8i8_i64_helper(i64 %2)
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_f64:
+declare <8 x i8> @test_v8i8_f64_helper(double %p)
+define void @test_v8i8_f64(double* %p, <8 x i8>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = load double* %p
+ %2 = fadd double %1, %1
+ %3 = call <8 x i8> @test_v8i8_f64_helper(double %2)
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_v1i64:
+declare <8 x i8> @test_v8i8_v1i64_helper(<1 x i64> %p)
+define void @test_v8i8_v1i64(<1 x i64>* %p, <8 x i8>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = load <1 x i64>* %p
+ %2 = add <1 x i64> %1, %1
+ %3 = call <8 x i8> @test_v8i8_v1i64_helper(<1 x i64> %2)
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_v2f32:
+declare <8 x i8> @test_v8i8_v2f32_helper(<2 x float> %p)
+define void @test_v8i8_v2f32(<2 x float>* %p, <8 x i8>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = load <2 x float>* %p
+ %2 = fadd <2 x float> %1, %1
+ %3 = call <8 x i8> @test_v8i8_v2f32_helper(<2 x float> %2)
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_v2i32:
+declare <8 x i8> @test_v8i8_v2i32_helper(<2 x i32> %p)
+define void @test_v8i8_v2i32(<2 x i32>* %p, <8 x i8>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.2s
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = load <2 x i32>* %p
+ %2 = add <2 x i32> %1, %1
+ %3 = call <8 x i8> @test_v8i8_v2i32_helper(<2 x i32> %2)
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i8_v4i16:
+declare <8 x i8> @test_v8i8_v4i16_helper(<4 x i16> %p)
+define void @test_v8i8_v4i16(<4 x i16>* %p, <8 x i8>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4h
+; CHECK: rev64 v{{[0-9]+}}.8b
+ %1 = load <4 x i16>* %p
+ %2 = add <4 x i16> %1, %1
+ %3 = call <8 x i8> @test_v8i8_v4i16_helper(<4 x i16> %2)
+ %4 = add <8 x i8> %3, %3
+ store <8 x i8> %4, <8 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v2f64:
+declare fp128 @test_f128_v2f64_helper(<2 x double> %p)
+define void @test_f128_v2f64(<2 x double>* %p, fp128* %q) {
+; CHECK: ext
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = call fp128 @test_f128_v2f64_helper(<2 x double> %2)
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v2i64:
+declare fp128 @test_f128_v2i64_helper(<2 x i64> %p)
+define void @test_f128_v2i64(<2 x i64>* %p, fp128* %q) {
+; CHECK: ext
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = call fp128 @test_f128_v2i64_helper(<2 x i64> %2)
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v4f32:
+declare fp128 @test_f128_v4f32_helper(<4 x float> %p)
+define void @test_f128_v4f32(<4 x float>* %p, fp128* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = call fp128 @test_f128_v4f32_helper(<4 x float> %2)
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v4i32:
+declare fp128 @test_f128_v4i32_helper(<4 x i32> %p)
+define void @test_f128_v4i32(<4 x i32>* %p, fp128* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = call fp128 @test_f128_v4i32_helper(<4 x i32> %2)
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v8i16:
+declare fp128 @test_f128_v8i16_helper(<8 x i16> %p)
+define void @test_f128_v8i16(<8 x i16>* %p, fp128* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = call fp128 @test_f128_v8i16_helper(<8 x i16> %2)
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_f128_v16i8:
+declare fp128 @test_f128_v16i8_helper(<16 x i8> %p)
+define void @test_f128_v16i8(<16 x i8>* %p, fp128* %q) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = call fp128 @test_f128_v16i8_helper(<16 x i8> %2)
+ %4 = fadd fp128 %3, %3
+ store fp128 %4, fp128* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_f128:
+declare <2 x double> @test_v2f64_f128_helper(fp128 %p)
+define void @test_v2f64_f128(fp128* %p, <2 x double>* %q) {
+; CHECK: ext
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = call <2 x double> @test_v2f64_f128_helper(fp128 %2)
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_v2i64:
+declare <2 x double> @test_v2f64_v2i64_helper(<2 x i64> %p)
+define void @test_v2f64_v2i64(<2 x i64>* %p, <2 x double>* %q) {
+; CHECK: ext
+; CHECK: ext
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = call <2 x double> @test_v2f64_v2i64_helper(<2 x i64> %2)
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_v4f32:
+declare <2 x double> @test_v2f64_v4f32_helper(<4 x float> %p)
+define void @test_v2f64_v4f32(<4 x float>* %p, <2 x double>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: ext
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = call <2 x double> @test_v2f64_v4f32_helper(<4 x float> %2)
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_v4i32:
+declare <2 x double> @test_v2f64_v4i32_helper(<4 x i32> %p)
+define void @test_v2f64_v4i32(<4 x i32>* %p, <2 x double>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: ext
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = call <2 x double> @test_v2f64_v4i32_helper(<4 x i32> %2)
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_v8i16:
+declare <2 x double> @test_v2f64_v8i16_helper(<8 x i16> %p)
+define void @test_v2f64_v8i16(<8 x i16>* %p, <2 x double>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: ext
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = call <2 x double> @test_v2f64_v8i16_helper(<8 x i16> %2)
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2f64_v16i8:
+declare <2 x double> @test_v2f64_v16i8_helper(<16 x i8> %p)
+define void @test_v2f64_v16i8(<16 x i8>* %p, <2 x double>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: ext
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = call <2 x double> @test_v2f64_v16i8_helper(<16 x i8> %2)
+ %4 = fadd <2 x double> %3, %3
+ store <2 x double> %4, <2 x double>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_f128:
+declare <2 x i64> @test_v2i64_f128_helper(fp128 %p)
+define void @test_v2i64_f128(fp128* %p, <2 x i64>* %q) {
+; CHECK: ext
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = call <2 x i64> @test_v2i64_f128_helper(fp128 %2)
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_v2f64:
+declare <2 x i64> @test_v2i64_v2f64_helper(<2 x double> %p)
+define void @test_v2i64_v2f64(<2 x double>* %p, <2 x i64>* %q) {
+; CHECK: ext
+; CHECK: ext
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = call <2 x i64> @test_v2i64_v2f64_helper(<2 x double> %2)
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_v4f32:
+declare <2 x i64> @test_v2i64_v4f32_helper(<4 x float> %p)
+define void @test_v2i64_v4f32(<4 x float>* %p, <2 x i64>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: ext
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = call <2 x i64> @test_v2i64_v4f32_helper(<4 x float> %2)
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_v4i32:
+declare <2 x i64> @test_v2i64_v4i32_helper(<4 x i32> %p)
+define void @test_v2i64_v4i32(<4 x i32>* %p, <2 x i64>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: ext
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = call <2 x i64> @test_v2i64_v4i32_helper(<4 x i32> %2)
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_v8i16:
+declare <2 x i64> @test_v2i64_v8i16_helper(<8 x i16> %p)
+define void @test_v2i64_v8i16(<8 x i16>* %p, <2 x i64>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: ext
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = call <2 x i64> @test_v2i64_v8i16_helper(<8 x i16> %2)
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v2i64_v16i8:
+declare <2 x i64> @test_v2i64_v16i8_helper(<16 x i8> %p)
+define void @test_v2i64_v16i8(<16 x i8>* %p, <2 x i64>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: ext
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = call <2 x i64> @test_v2i64_v16i8_helper(<16 x i8> %2)
+ %4 = add <2 x i64> %3, %3
+ store <2 x i64> %4, <2 x i64>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_f128:
+declare <4 x float> @test_v4f32_f128_helper(fp128 %p)
+define void @test_v4f32_f128(fp128* %p, <4 x float>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = call <4 x float> @test_v4f32_f128_helper(fp128 %2)
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_v2f64:
+declare <4 x float> @test_v4f32_v2f64_helper(<2 x double> %p)
+define void @test_v4f32_v2f64(<2 x double>* %p, <4 x float>* %q) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = call <4 x float> @test_v4f32_v2f64_helper(<2 x double> %2)
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_v2i64:
+declare <4 x float> @test_v4f32_v2i64_helper(<2 x i64> %p)
+define void @test_v4f32_v2i64(<2 x i64>* %p, <4 x float>* %q) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = call <4 x float> @test_v4f32_v2i64_helper(<2 x i64> %2)
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_v4i32:
+declare <4 x float> @test_v4f32_v4i32_helper(<4 x i32> %p)
+define void @test_v4f32_v4i32(<4 x i32>* %p, <4 x float>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = call <4 x float> @test_v4f32_v4i32_helper(<4 x i32> %2)
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_v8i16:
+declare <4 x float> @test_v4f32_v8i16_helper(<8 x i16> %p)
+define void @test_v4f32_v8i16(<8 x i16>* %p, <4 x float>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = call <4 x float> @test_v4f32_v8i16_helper(<8 x i16> %2)
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4f32_v16i8:
+declare <4 x float> @test_v4f32_v16i8_helper(<16 x i8> %p)
+define void @test_v4f32_v16i8(<16 x i8>* %p, <4 x float>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = call <4 x float> @test_v4f32_v16i8_helper(<16 x i8> %2)
+ %4 = fadd <4 x float> %3, %3
+ store <4 x float> %4, <4 x float>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_f128:
+declare <4 x i32> @test_v4i32_f128_helper(fp128 %p)
+define void @test_v4i32_f128(fp128* %p, <4 x i32>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = call <4 x i32> @test_v4i32_f128_helper(fp128 %2)
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_v2f64:
+declare <4 x i32> @test_v4i32_v2f64_helper(<2 x double> %p)
+define void @test_v4i32_v2f64(<2 x double>* %p, <4 x i32>* %q) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = call <4 x i32> @test_v4i32_v2f64_helper(<2 x double> %2)
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_v2i64:
+declare <4 x i32> @test_v4i32_v2i64_helper(<2 x i64> %p)
+define void @test_v4i32_v2i64(<2 x i64>* %p, <4 x i32>* %q) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = call <4 x i32> @test_v4i32_v2i64_helper(<2 x i64> %2)
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_v4f32:
+declare <4 x i32> @test_v4i32_v4f32_helper(<4 x float> %p)
+define void @test_v4i32_v4f32(<4 x float>* %p, <4 x i32>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = call <4 x i32> @test_v4i32_v4f32_helper(<4 x float> %2)
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_v8i16:
+declare <4 x i32> @test_v4i32_v8i16_helper(<8 x i16> %p)
+define void @test_v4i32_v8i16(<8 x i16>* %p, <4 x i32>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = call <4 x i32> @test_v4i32_v8i16_helper(<8 x i16> %2)
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v4i32_v16i8:
+declare <4 x i32> @test_v4i32_v16i8_helper(<16 x i8> %p)
+define void @test_v4i32_v16i8(<16 x i8>* %p, <4 x i32>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = call <4 x i32> @test_v4i32_v16i8_helper(<16 x i8> %2)
+ %4 = add <4 x i32> %3, %3
+ store <4 x i32> %4, <4 x i32>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_f128:
+declare <8 x i16> @test_v8i16_f128_helper(fp128 %p)
+define void @test_v8i16_f128(fp128* %p, <8 x i16>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = call <8 x i16> @test_v8i16_f128_helper(fp128 %2)
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_v2f64:
+declare <8 x i16> @test_v8i16_v2f64_helper(<2 x double> %p)
+define void @test_v8i16_v2f64(<2 x double>* %p, <8 x i16>* %q) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = call <8 x i16> @test_v8i16_v2f64_helper(<2 x double> %2)
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_v2i64:
+declare <8 x i16> @test_v8i16_v2i64_helper(<2 x i64> %p)
+define void @test_v8i16_v2i64(<2 x i64>* %p, <8 x i16>* %q) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = call <8 x i16> @test_v8i16_v2i64_helper(<2 x i64> %2)
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_v4f32:
+declare <8 x i16> @test_v8i16_v4f32_helper(<4 x float> %p)
+define void @test_v8i16_v4f32(<4 x float>* %p, <8 x i16>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = call <8 x i16> @test_v8i16_v4f32_helper(<4 x float> %2)
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_v4i32:
+declare <8 x i16> @test_v8i16_v4i32_helper(<4 x i32> %p)
+define void @test_v8i16_v4i32(<4 x i32>* %p, <8 x i16>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = call <8 x i16> @test_v8i16_v4i32_helper(<4 x i32> %2)
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v8i16_v16i8:
+declare <8 x i16> @test_v8i16_v16i8_helper(<16 x i8> %p)
+define void @test_v8i16_v16i8(<16 x i8>* %p, <8 x i16>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+ %1 = load <16 x i8>* %p
+ %2 = add <16 x i8> %1, %1
+ %3 = call <8 x i16> @test_v8i16_v16i8_helper(<16 x i8> %2)
+ %4 = add <8 x i16> %3, %3
+ store <8 x i16> %4, <8 x i16>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_f128:
+declare <16 x i8> @test_v16i8_f128_helper(fp128 %p)
+define void @test_v16i8_f128(fp128* %p, <16 x i8>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = load fp128* %p
+ %2 = fadd fp128 %1, %1
+ %3 = call <16 x i8> @test_v16i8_f128_helper(fp128 %2)
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_v2f64:
+declare <16 x i8> @test_v16i8_v2f64_helper(<2 x double> %p)
+define void @test_v16i8_v2f64(<2 x double>* %p, <16 x i8>* %q) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = load <2 x double>* %p
+ %2 = fadd <2 x double> %1, %1
+ %3 = call <16 x i8> @test_v16i8_v2f64_helper(<2 x double> %2)
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_v2i64:
+declare <16 x i8> @test_v16i8_v2i64_helper(<2 x i64> %p)
+define void @test_v16i8_v2i64(<2 x i64>* %p, <16 x i8>* %q) {
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = load <2 x i64>* %p
+ %2 = add <2 x i64> %1, %1
+ %3 = call <16 x i8> @test_v16i8_v2i64_helper(<2 x i64> %2)
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_v4f32:
+declare <16 x i8> @test_v16i8_v4f32_helper(<4 x float> %p)
+define void @test_v16i8_v4f32(<4 x float>* %p, <16 x i8>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = load <4 x float>* %p
+ %2 = fadd <4 x float> %1, %1
+ %3 = call <16 x i8> @test_v16i8_v4f32_helper(<4 x float> %2)
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_v4i32:
+declare <16 x i8> @test_v16i8_v4i32_helper(<4 x i32> %p)
+define void @test_v16i8_v4i32(<4 x i32>* %p, <16 x i8>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.4s
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = load <4 x i32>* %p
+ %2 = add <4 x i32> %1, %1
+ %3 = call <16 x i8> @test_v16i8_v4i32_helper(<4 x i32> %2)
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
+
+; CHECK-LABEL: test_v16i8_v8i16:
+declare <16 x i8> @test_v16i8_v8i16_helper(<8 x i16> %p)
+define void @test_v16i8_v8i16(<8 x i16>* %p, <16 x i8>* %q) {
+; CHECK: rev64 v{{[0-9]+}}.8h
+; CHECK: ext
+; CHECK: rev64 v{{[0-9]+}}.16b
+; CHECK: ext
+ %1 = load <8 x i16>* %p
+ %2 = add <8 x i16> %1, %1
+ %3 = call <16 x i8> @test_v16i8_v8i16_helper(<8 x i16> %2)
+ %4 = add <16 x i8> %3, %3
+ store <16 x i8> %4, <16 x i8>* %q
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-big-imm-offsets.ll b/test/CodeGen/AArch64/arm64-big-imm-offsets.ll
new file mode 100644
index 0000000..a56df07
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-big-imm-offsets.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=arm64 < %s
+
+
+; Make sure large offsets aren't mistaken for valid immediate offsets.
+; <rdar://problem/13190511>
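+; 25769803792 is 0x600000010, well outside both the unsigned scaled 12-bit
+; and the signed 9-bit immediate offset ranges, so the offset has to be
+; materialized in a register.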
+define void @f(i32* nocapture %p) {
+entry:
+ %a = ptrtoint i32* %p to i64
+ %ao = add i64 %a, 25769803792
+ %b = inttoptr i64 %ao to i32*
+ store volatile i32 0, i32* %b, align 4
+ store volatile i32 0, i32* %b, align 4
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-big-stack.ll b/test/CodeGen/AArch64/arm64-big-stack.ll
new file mode 100644
index 0000000..3f91bb3c2
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-big-stack.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s | FileCheck %s
+target triple = "arm64-apple-macosx10"
+
+; Check that big stacks are generated correctly.
+; Currently, this is done by a sequence of sub instructions, each of
+; which can encode a 12-bit immediate, optionally shifted left by 12.
+; I.e., 16773120 is the biggest value a single sub can handle.
+; <rdar://12513931>
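+; For the 33554432-byte buffer below, the three subs checked for add up
+; exactly: 2 * (4095 << 12) + (2 << 12) = 33546240 + 8192 = 33554432.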
+; CHECK-LABEL: foo:
+; CHECK: sub sp, sp, #4095, lsl #12
+; CHECK: sub sp, sp, #4095, lsl #12
+; CHECK: sub sp, sp, #2, lsl #12
+define void @foo() nounwind ssp {
+entry:
+ %buffer = alloca [33554432 x i8], align 1
+ %arraydecay = getelementptr inbounds [33554432 x i8]* %buffer, i64 0, i64 0
+ call void @doit(i8* %arraydecay) nounwind
+ ret void
+}
+
+declare void @doit(i8*)
diff --git a/test/CodeGen/AArch64/arm64-bitfield-extract.ll b/test/CodeGen/AArch64/arm64-bitfield-extract.ll
new file mode 100644
index 0000000..112efdd
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-bitfield-extract.ll
@@ -0,0 +1,532 @@
+; RUN: opt -codegenprepare -mtriple=arm64-apple-ios -S -o - %s | FileCheck --check-prefix=OPT %s
+; RUN: llc < %s -march=arm64 | FileCheck %s
+%struct.X = type { i8, i8, [2 x i8] }
+%struct.Y = type { i32, i8 }
+%struct.Z = type { i8, i8, [2 x i8], i16 }
+%struct.A = type { i64, i8 }
+
+define void @foo(%struct.X* nocapture %x, %struct.Y* nocapture %y) nounwind optsize ssp {
+; CHECK-LABEL: foo:
+; CHECK: ubfx
+; CHECK-NOT: and
+; CHECK: ret
+
+ %tmp = bitcast %struct.X* %x to i32*
+ %tmp1 = load i32* %tmp, align 4
+ %b = getelementptr inbounds %struct.Y* %y, i64 0, i32 1
+ %bf.clear = lshr i32 %tmp1, 3
+ %bf.clear.lobit = and i32 %bf.clear, 1
+ %frombool = trunc i32 %bf.clear.lobit to i8
+ store i8 %frombool, i8* %b, align 1
+ ret void
+}
+
+define i32 @baz(i64 %cav1.coerce) nounwind {
+; CHECK-LABEL: baz:
+; CHECK: sbfx w0, w0, #0, #4
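+; shl by 28 followed by ashr by 28 on an i32 sign-extends the low 4 bits,
+; which is exactly sbfx #0, #4.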
+ %tmp = trunc i64 %cav1.coerce to i32
+ %tmp1 = shl i32 %tmp, 28
+ %bf.val.sext = ashr exact i32 %tmp1, 28
+ ret i32 %bf.val.sext
+}
+
+define i32 @bar(i64 %cav1.coerce) nounwind {
+; CHECK-LABEL: bar:
+; CHECK: sbfx w0, w0, #4, #6
+ %tmp = trunc i64 %cav1.coerce to i32
+ %cav1.sroa.0.1.insert = shl i32 %tmp, 22
+ %tmp1 = ashr i32 %cav1.sroa.0.1.insert, 26
+ ret i32 %tmp1
+}
+
+define void @fct1(%struct.Z* nocapture %x, %struct.A* nocapture %y) nounwind optsize ssp {
+; CHECK-LABEL: fct1:
+; CHECK: ubfx
+; CHECK-NOT: and
+; CHECK: ret
+
+ %tmp = bitcast %struct.Z* %x to i64*
+ %tmp1 = load i64* %tmp, align 4
+ %b = getelementptr inbounds %struct.A* %y, i64 0, i32 0
+ %bf.clear = lshr i64 %tmp1, 3
+ %bf.clear.lobit = and i64 %bf.clear, 1
+ store i64 %bf.clear.lobit, i64* %b, align 8
+ ret void
+}
+
+define i64 @fct2(i64 %cav1.coerce) nounwind {
+; CHECK-LABEL: fct2:
+; CHECK: sbfx x0, x0, #0, #36
+ %tmp = shl i64 %cav1.coerce, 28
+ %bf.val.sext = ashr exact i64 %tmp, 28
+ ret i64 %bf.val.sext
+}
+
+define i64 @fct3(i64 %cav1.coerce) nounwind {
+; CHECK-LABEL: fct3:
+; CHECK: sbfx x0, x0, #4, #38
+ %cav1.sroa.0.1.insert = shl i64 %cav1.coerce, 22
+ %tmp1 = ashr i64 %cav1.sroa.0.1.insert, 26
+ ret i64 %tmp1
+}
+
+define void @fct4(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct4:
+; CHECK: ldr [[REG1:x[0-9]+]],
+; CHECK-NEXT: bfxil [[REG1]], x1, #16, #24
+; CHECK-NEXT: str [[REG1]],
+; CHECK-NEXT: ret
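+; bfxil copies 24 bits starting at bit 16 of x1 into bits [23:0] of [[REG1]],
+; matching the (%0 & -16777216) | ((%x >> 16) & 16777215) pattern below.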
+ %0 = load i64* %y, align 8
+ %and = and i64 %0, -16777216
+ %shr = lshr i64 %x, 16
+ %and1 = and i64 %shr, 16777215
+ %or = or i64 %and, %and1
+ store i64 %or, i64* %y, align 8
+ ret void
+}
+
+define void @fct5(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct5:
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NEXT: bfxil [[REG1]], w1, #16, #3
+; CHECK-NEXT: str [[REG1]],
+; CHECK-NEXT: ret
+ %0 = load i32* %y, align 8
+ %and = and i32 %0, -8
+ %shr = lshr i32 %x, 16
+ %and1 = and i32 %shr, 7
+ %or = or i32 %and, %and1
+ store i32 %or, i32* %y, align 8
+ ret void
+}
+
+; Check if we can still catch bfm instruction when we drop some low bits
+define void @fct6(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct6:
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NEXT: bfxil [[REG1]], w1, #16, #3
+; lsr is an alias of ubfm
+; CHECK-NEXT: lsr [[REG2:w[0-9]+]], [[REG1]], #2
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+ %0 = load i32* %y, align 8
+ %and = and i32 %0, -8
+ %shr = lshr i32 %x, 16
+ %and1 = and i32 %shr, 7
+ %or = or i32 %and, %and1
+ %shr1 = lshr i32 %or, 2
+ store i32 %shr1, i32* %y, align 8
+ ret void
+}
+
+
+; Check if we can still catch bfm instruction when we drop some high bits
+define void @fct7(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct7:
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NEXT: bfxil [[REG1]], w1, #16, #3
+; lsl is an alias of ubfm
+; CHECK-NEXT: lsl [[REG2:w[0-9]+]], [[REG1]], #2
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+ %0 = load i32* %y, align 8
+ %and = and i32 %0, -8
+ %shr = lshr i32 %x, 16
+ %and1 = and i32 %shr, 7
+ %or = or i32 %and, %and1
+ %shl = shl i32 %or, 2
+ store i32 %shl, i32* %y, align 8
+ ret void
+}
+
+
+; Check if we can still catch bfm instruction when we drop some low bits
+; (i64 version)
+define void @fct8(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct8:
+; CHECK: ldr [[REG1:x[0-9]+]],
+; CHECK-NEXT: bfxil [[REG1]], x1, #16, #3
+; lsr is an alias of ubfm
+; CHECK-NEXT: lsr [[REG2:x[0-9]+]], [[REG1]], #2
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+ %0 = load i64* %y, align 8
+ %and = and i64 %0, -8
+ %shr = lshr i64 %x, 16
+ %and1 = and i64 %shr, 7
+ %or = or i64 %and, %and1
+ %shr1 = lshr i64 %or, 2
+ store i64 %shr1, i64* %y, align 8
+ ret void
+}
+
+
+; Check if we can still catch bfm instruction when we drop some high bits
+; (i64 version)
+define void @fct9(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct9:
+; CHECK: ldr [[REG1:x[0-9]+]],
+; CHECK-NEXT: bfxil [[REG1]], x1, #16, #3
+; lsl is an alias of ubfm
+; CHECK-NEXT: lsl [[REG2:x[0-9]+]], [[REG1]], #2
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+ %0 = load i64* %y, align 8
+ %and = and i64 %0, -8
+ %shr = lshr i64 %x, 16
+ %and1 = and i64 %shr, 7
+ %or = or i64 %and, %and1
+ %shl = shl i64 %or, 2
+ store i64 %shl, i64* %y, align 8
+ ret void
+}
+
+; Check if we can catch bfm instruction when lsb is 0 (i.e., no lshr)
+; (i32 version)
+define void @fct10(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct10:
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NEXT: bfxil [[REG1]], w1, #0, #3
+; lsl is an alias of ubfm
+; CHECK-NEXT: lsl [[REG2:w[0-9]+]], [[REG1]], #2
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+ %0 = load i32* %y, align 8
+ %and = and i32 %0, -8
+ %and1 = and i32 %x, 7
+ %or = or i32 %and, %and1
+ %shl = shl i32 %or, 2
+ store i32 %shl, i32* %y, align 8
+ ret void
+}
+
+; Check if we can catch bfm instruction when lsb is 0 (i.e., no lshr)
+; (i64 version)
+define void @fct11(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct11:
+; CHECK: ldr [[REG1:x[0-9]+]],
+; CHECK-NEXT: bfxil [[REG1]], x1, #0, #3
+; lsl is an alias of ubfm
+; CHECK-NEXT: lsl [[REG2:x[0-9]+]], [[REG1]], #2
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+ %0 = load i64* %y, align 8
+ %and = and i64 %0, -8
+ %and1 = and i64 %x, 7
+ %or = or i64 %and, %and1
+ %shl = shl i64 %or, 2
+ store i64 %shl, i64* %y, align 8
+ ret void
+}
+
+define zeroext i1 @fct12bis(i32 %tmp2) unnamed_addr nounwind ssp align 2 {
+; CHECK-LABEL: fct12bis:
+; CHECK-NOT: and
+; CHECK: ubfx w0, w0, #11, #1
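+; 2048 is bit 11, so testing (%tmp2 & 2048) != 0 reduces to ubfx #11, #1.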
+ %and.i.i = and i32 %tmp2, 2048
+ %tobool.i.i = icmp ne i32 %and.i.i, 0
+ ret i1 %tobool.i.i
+}
+
+; Check if we can still catch bfm instruction when we drop some high bits
+; and some low bits
+define void @fct12(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct12:
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NEXT: bfxil [[REG1]], w1, #16, #3
+; lsr is an alias of ubfm
+; CHECK-NEXT: ubfx [[REG2:w[0-9]+]], [[REG1]], #2, #28
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+ %0 = load i32* %y, align 8
+ %and = and i32 %0, -8
+ %shr = lshr i32 %x, 16
+ %and1 = and i32 %shr, 7
+ %or = or i32 %and, %and1
+ %shl = shl i32 %or, 2
+ %shr2 = lshr i32 %shl, 4
+ store i32 %shr2, i32* %y, align 8
+ ret void
+}
+
+; Check if we can still catch bfm instruction when we drop some high bits
+; and some low bits
+; (i64 version)
+define void @fct13(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct13:
+; CHECK: ldr [[REG1:x[0-9]+]],
+; CHECK-NEXT: bfxil [[REG1]], x1, #16, #3
+; lsr is an alias of ubfm
+; CHECK-NEXT: ubfx [[REG2:x[0-9]+]], [[REG1]], #2, #60
+; CHECK-NEXT: str [[REG2]],
+; CHECK-NEXT: ret
+ %0 = load i64* %y, align 8
+ %and = and i64 %0, -8
+ %shr = lshr i64 %x, 16
+ %and1 = and i64 %shr, 7
+ %or = or i64 %and, %and1
+ %shl = shl i64 %or, 2
+ %shr2 = lshr i64 %shl, 4
+ store i64 %shr2, i64* %y, align 8
+ ret void
+}
+
+
+; Check if we can still catch bfm instruction when we drop some high bits
+; and some low bits
+define void @fct14(i32* nocapture %y, i32 %x, i32 %x1) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct14:
+; CHECK: ldr [[REG1:w[0-9]+]],
+; CHECK-NEXT: bfxil [[REG1]], w1, #16, #8
+; lsr is an alias of ubfm
+; CHECK-NEXT: lsr [[REG2:w[0-9]+]], [[REG1]], #4
+; CHECK-NEXT: bfxil [[REG2]], w2, #5, #3
+; lsl is an alias of ubfm
+; CHECK-NEXT: lsl [[REG3:w[0-9]+]], [[REG2]], #2
+; CHECK-NEXT: str [[REG3]],
+; CHECK-NEXT: ret
+ %0 = load i32* %y, align 8
+ %and = and i32 %0, -256
+ %shr = lshr i32 %x, 16
+ %and1 = and i32 %shr, 255
+ %or = or i32 %and, %and1
+ %shl = lshr i32 %or, 4
+ %and2 = and i32 %shl, -8
+ %shr1 = lshr i32 %x1, 5
+ %and3 = and i32 %shr1, 7
+ %or1 = or i32 %and2, %and3
+ %shl1 = shl i32 %or1, 2
+ store i32 %shl1, i32* %y, align 8
+ ret void
+}
+
+; Check if we can still catch bfm instruction when we drop some high bits
+; and some low bits
+; (i64 version)
+define void @fct15(i64* nocapture %y, i64 %x, i64 %x1) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct15:
+; CHECK: ldr [[REG1:x[0-9]+]],
+; CHECK-NEXT: bfxil [[REG1]], x1, #16, #8
+; lsr is an alias of ubfm
+; CHECK-NEXT: lsr [[REG2:x[0-9]+]], [[REG1]], #4
+; CHECK-NEXT: bfxil [[REG2]], x2, #5, #3
+; lsl is an alias of ubfm
+; CHECK-NEXT: lsl [[REG3:x[0-9]+]], [[REG2]], #2
+; CHECK-NEXT: str [[REG3]],
+; CHECK-NEXT: ret
+ %0 = load i64* %y, align 8
+ %and = and i64 %0, -256
+ %shr = lshr i64 %x, 16
+ %and1 = and i64 %shr, 255
+ %or = or i64 %and, %and1
+ %shl = lshr i64 %or, 4
+ %and2 = and i64 %shl, -8
+ %shr1 = lshr i64 %x1, 5
+ %and3 = and i64 %shr1, 7
+ %or1 = or i64 %and2, %and3
+ %shl1 = shl i64 %or1, 2
+ store i64 %shl1, i64* %y, align 8
+ ret void
+}
+
+; Check if we can still catch bfm instruction when we drop some high bits
+; and some low bits and a masking operation has to be kept
+define void @fct16(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct16:
+; CHECK: ldr [[REG1:w[0-9]+]],
+; Create the constant
+; CHECK: movz [[REGCST:w[0-9]+]], #0x1a, lsl #16
+; CHECK: movk [[REGCST]], #0x8160
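+; (0x1a << 16) | 0x8160 = 0x1a8160 = 1737056, the mask used in the IR below.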
+; Do the masking
+; CHECK: and [[REG2:w[0-9]+]], [[REG1]], [[REGCST]]
+; CHECK-NEXT: bfxil [[REG2]], w1, #16, #3
+; lsr is an alias of ubfm
+; CHECK-NEXT: ubfx [[REG3:w[0-9]+]], [[REG2]], #2, #28
+; CHECK-NEXT: str [[REG3]],
+; CHECK-NEXT: ret
+ %0 = load i32* %y, align 8
+ %and = and i32 %0, 1737056
+ %shr = lshr i32 %x, 16
+ %and1 = and i32 %shr, 7
+ %or = or i32 %and, %and1
+ %shl = shl i32 %or, 2
+ %shr2 = lshr i32 %shl, 4
+ store i32 %shr2, i32* %y, align 8
+ ret void
+}
+
+
+; Check if we can still catch bfm instruction when we drop some high bits
+; and some low bits and a masking operation has to be kept
+; (i64 version)
+define void @fct17(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp {
+entry:
+; CHECK-LABEL: fct17:
+; CHECK: ldr [[REG1:x[0-9]+]],
+; Create the constant
+; CHECK: movz w[[REGCST:[0-9]+]], #0x1a, lsl #16
+; CHECK: movk w[[REGCST]], #0x8160
+; Do the masking
+; CHECK: and [[REG2:x[0-9]+]], [[REG1]], x[[REGCST]]
+; CHECK-NEXT: bfxil [[REG2]], x1, #16, #3
+; lsr is an alias of ubfm
+; CHECK-NEXT: ubfx [[REG3:x[0-9]+]], [[REG2]], #2, #60
+; CHECK-NEXT: str [[REG3]],
+; CHECK-NEXT: ret
+ %0 = load i64* %y, align 8
+ %and = and i64 %0, 1737056
+ %shr = lshr i64 %x, 16
+ %and1 = and i64 %shr, 7
+ %or = or i64 %and, %and1
+ %shl = shl i64 %or, 2
+ %shr2 = lshr i64 %shl, 4
+ store i64 %shr2, i64* %y, align 8
+ ret void
+}
+
+define i64 @fct18(i32 %xor72) nounwind ssp {
+; CHECK-LABEL: fct18:
+; CHECK: ubfx x0, x0, #9, #8
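+; (%xor72 >> 9) & 255 keeps the 8 bits at positions [16:9], i.e. ubfx #9, #8.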
+ %shr81 = lshr i32 %xor72, 9
+ %conv82 = zext i32 %shr81 to i64
+ %result = and i64 %conv82, 255
+ ret i64 %result
+}
+
+; The accesses to the global array keep the instructions and the control flow
+; from being optimized away.
+@first_ones = external global [65536 x i8]
+
+; Function Attrs: nounwind readonly ssp
+define i32 @fct19(i64 %arg1) nounwind readonly ssp {
+; CHECK-LABEL: fct19:
+entry:
+ %x.sroa.1.0.extract.shift = lshr i64 %arg1, 16
+ %x.sroa.1.0.extract.trunc = trunc i64 %x.sroa.1.0.extract.shift to i16
+ %x.sroa.3.0.extract.shift = lshr i64 %arg1, 32
+ %x.sroa.5.0.extract.shift = lshr i64 %arg1, 48
+ %tobool = icmp eq i64 %x.sroa.5.0.extract.shift, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %arrayidx3 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %x.sroa.5.0.extract.shift
+ %0 = load i8* %arrayidx3, align 1
+ %conv = zext i8 %0 to i32
+ br label %return
+
+; OPT-LABEL: if.end
+if.end: ; preds = %entry
+; OPT: lshr
+; CHECK: ubfx [[REG1:x[0-9]+]], [[REG2:x[0-9]+]], #32, #16
+ %x.sroa.3.0.extract.trunc = trunc i64 %x.sroa.3.0.extract.shift to i16
+ %tobool6 = icmp eq i16 %x.sroa.3.0.extract.trunc, 0
+; CHECK: cbz
+ br i1 %tobool6, label %if.end13, label %if.then7
+
+; OPT-LABEL: if.then7
+if.then7: ; preds = %if.end
+; OPT: lshr
+; "and" should be combined to "ubfm" while "ubfm" should be removed by cse.
+; So neither of them should be in the assemble code.
+; CHECK-NOT: and
+; CHECK-NOT: ubfm
+ %idxprom10 = and i64 %x.sroa.3.0.extract.shift, 65535
+ %arrayidx11 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %idxprom10
+ %1 = load i8* %arrayidx11, align 1
+ %conv12 = zext i8 %1 to i32
+ %add = add nsw i32 %conv12, 16
+ br label %return
+
+; OPT-LABEL: if.end13
+if.end13: ; preds = %if.end
+; OPT: lshr
+; OPT: trunc
+; CHECK: ubfx [[REG3:x[0-9]+]], [[REG4:x[0-9]+]], #16, #16
+ %tobool16 = icmp eq i16 %x.sroa.1.0.extract.trunc, 0
+; CHECK: cbz
+ br i1 %tobool16, label %return, label %if.then17
+
+; OPT-LABEL: if.then17
+if.then17: ; preds = %if.end13
+; OPT: lshr
+; "and" should be combined to "ubfm" while "ubfm" should be removed by cse.
+; So neither of them should be in the assemble code.
+; CHECK-NOT: and
+; CHECK-NOT: ubfm
+ %idxprom20 = and i64 %x.sroa.1.0.extract.shift, 65535
+ %arrayidx21 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %idxprom20
+ %2 = load i8* %arrayidx21, align 1
+ %conv22 = zext i8 %2 to i32
+ %add23 = add nsw i32 %conv22, 32
+ br label %return
+
+return: ; preds = %if.end13, %if.then17, %if.then7, %if.then
+; CHECK: ret
+ %retval.0 = phi i32 [ %conv, %if.then ], [ %add, %if.then7 ], [ %add23, %if.then17 ], [ 64, %if.end13 ]
+ ret i32 %retval.0
+}
+
+; Make sure we do not assert if the immediate in the "and" is wider than i64.
+; PR19503.
+; OPT-LABEL: @fct20
+; OPT: lshr
+; OPT-NOT: lshr
+; OPT: ret
+; CHECK-LABEL: fct20:
+; CHECK: ret
+define i80 @fct20(i128 %a, i128 %b) {
+entry:
+ %shr = lshr i128 %a, 18
+ %conv = trunc i128 %shr to i80
+ %tobool = icmp eq i128 %b, 0
+ br i1 %tobool, label %then, label %end
+then:
+ %and = and i128 %shr, 483673642326615442599424
+ %conv2 = trunc i128 %and to i80
+ br label %end
+end:
+ %conv3 = phi i80 [%conv, %entry], [%conv2, %then]
+ ret i80 %conv3
+}
+
+; Check that we can still catch UBFX when the "AND" is used by a SHL.
+; CHECK-LABEL: fct21:
+; CHECK: ubfx
+@arr = external global [8 x [64 x i64]]
+define i64 @fct21(i64 %x) {
+entry:
+ %shr = lshr i64 %x, 4
+ %and = and i64 %shr, 15
+ %arrayidx = getelementptr inbounds [8 x [64 x i64]]* @arr, i64 0, i64 0, i64 %and
+ %0 = load i64* %arrayidx, align 8
+ ret i64 %0
+}
+
+define i16 @test_ignored_rightbits(i32 %dst, i32 %in) {
+; CHECK-LABEL: test_ignored_rightbits:
+
+ %positioned_field = shl i32 %in, 3
+ %positioned_masked_field = and i32 %positioned_field, 120
+ %masked_dst = and i32 %dst, 7
+ %insertion = or i32 %masked_dst, %positioned_masked_field
+; CHECK: {{bfm|bfi|bfxil}}
+
+ %shl16 = shl i32 %insertion, 8
+ %or18 = or i32 %shl16, %insertion
+ %conv19 = trunc i32 %or18 to i16
+; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #8, #7
+
+ ret i16 %conv19
+}
diff --git a/test/CodeGen/AArch64/arm64-blockaddress.ll b/test/CodeGen/AArch64/arm64-blockaddress.ll
new file mode 100644
index 0000000..ac4f19e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-blockaddress.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefix=CHECK-LINUX
+; RUN: llc < %s -mtriple=arm64-linux-gnu -code-model=large| FileCheck %s --check-prefix=CHECK-LARGE
+
+; rdar://9188695
+
+define i64 @t() nounwind ssp {
+entry:
+; CHECK-LABEL: t:
+; CHECK: adrp [[REG:x[0-9]+]], Ltmp1@PAGE
+; CHECK: add {{x[0-9]+}}, [[REG]], Ltmp1@PAGEOFF
+
+; CHECK-LINUX-LABEL: t:
+; CHECK-LINUX: adrp [[REG:x[0-9]+]], .Ltmp1
+; CHECK-LINUX: add {{x[0-9]+}}, [[REG]], :lo12:.Ltmp1
+
+; CHECK-LARGE-LABEL: t:
+; CHECK-LARGE: movz [[ADDR_REG:x[0-9]+]], #:abs_g3:[[DEST_LBL:.Ltmp[0-9]+]]
+; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g2_nc:[[DEST_LBL]]
+; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g1_nc:[[DEST_LBL]]
+; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g0_nc:[[DEST_LBL]]
+
+ %recover = alloca i64, align 8
+ store volatile i64 ptrtoint (i8* blockaddress(@t, %mylabel) to i64), i64* %recover, align 8
+ br label %mylabel
+
+mylabel:
+ %tmp = load volatile i64* %recover, align 8
+ ret i64 %tmp
+}
diff --git a/test/CodeGen/AArch64/arm64-build-vector.ll b/test/CodeGen/AArch64/arm64-build-vector.ll
new file mode 100644
index 0000000..c109263
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-build-vector.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+; Check that a vector built up with only one non-zero lane is initialized
+; intelligently.
+define void @one_lane(i32* nocapture %out_int, i32 %skip0) nounwind {
+; CHECK-LABEL: one_lane:
+; CHECK: dup.16b v[[REG:[0-9]+]], wzr
+; CHECK-NEXT: ins.b v[[REG]][0], w1
+; v and q are aliases, and str is preferred over st.16b when possible
+; rdar://11246289
+; CHECK: str q[[REG]], [x0]
+; CHECK: ret
+ %conv = trunc i32 %skip0 to i8
+ %vset_lane = insertelement <16 x i8> <i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, i8 %conv, i32 0
+ %tmp = bitcast i32* %out_int to <4 x i32>*
+ %tmp1 = bitcast <16 x i8> %vset_lane to <4 x i32>
+ store <4 x i32> %tmp1, <4 x i32>* %tmp, align 16
+ ret void
+}
+
+; Check that building a vector from floats doesn't insert an unnecessary
+; copy for lane zero.
+define <4 x float> @foo(float %a, float %b, float %c, float %d) nounwind {
+; CHECK-LABEL: foo:
+; CHECK-NOT: ins.s v0[0], v0[0]
+; CHECK: ins.s v0[1], v1[0]
+; CHECK: ins.s v0[2], v2[0]
+; CHECK: ins.s v0[3], v3[0]
+; CHECK: ret
+ %1 = insertelement <4 x float> undef, float %a, i32 0
+ %2 = insertelement <4 x float> %1, float %b, i32 1
+ %3 = insertelement <4 x float> %2, float %c, i32 2
+ %4 = insertelement <4 x float> %3, float %d, i32 3
+ ret <4 x float> %4
+}
diff --git a/test/CodeGen/AArch64/arm64-call-tailcalls.ll b/test/CodeGen/AArch64/arm64-call-tailcalls.ll
new file mode 100644
index 0000000..487c1d9
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-call-tailcalls.ll
@@ -0,0 +1,91 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+
+@t = weak global i32 ()* null
+@x = external global i32, align 4
+
+define void @t2() {
+; CHECK-LABEL: t2:
+; CHECK: adrp x[[GOTADDR:[0-9]+]], _t@GOTPAGE
+; CHECK: ldr x[[ADDR:[0-9]+]], [x[[GOTADDR]], _t@GOTPAGEOFF]
+; CHECK: ldr x[[DEST:[0-9]+]], [x[[ADDR]]]
+; CHECK: br x[[DEST]]
+ %tmp = load i32 ()** @t
+ %tmp.upgrd.2 = tail call i32 %tmp()
+ ret void
+}
+
+define void @t3() {
+; CHECK-LABEL: t3:
+; CHECK: b _t2
+ tail call void @t2()
+ ret void
+}
+
+define double @t4(double %a) nounwind readonly ssp {
+; CHECK-LABEL: t4:
+; CHECK: b _sin
+ %tmp = tail call double @sin(double %a) nounwind readonly
+ ret double %tmp
+}
+
+define float @t5(float %a) nounwind readonly ssp {
+; CHECK-LABEL: t5:
+; CHECK: b _sinf
+ %tmp = tail call float @sinf(float %a) nounwind readonly
+ ret float %tmp
+}
+
+define void @t7() nounwind {
+; CHECK-LABEL: t7:
+; CHECK: b _foo
+; CHECK: b _bar
+
+ br i1 undef, label %bb, label %bb1.lr.ph
+
+bb1.lr.ph: ; preds = %entry
+ tail call void @bar() nounwind
+ ret void
+
+bb: ; preds = %entry
+ tail call void @foo() nounwind
+ ret void
+}
+
+define i32 @t8(i32 %x) nounwind ssp {
+; CHECK-LABEL: t8:
+; CHECK: b _a
+; CHECK: b _b
+; CHECK: b _c
+ %and = and i32 %x, 1
+ %tobool = icmp eq i32 %and, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %call = tail call i32 @a(i32 %x) nounwind
+ br label %return
+
+if.end: ; preds = %entry
+ %and1 = and i32 %x, 2
+ %tobool2 = icmp eq i32 %and1, 0
+ br i1 %tobool2, label %if.end5, label %if.then3
+
+if.then3: ; preds = %if.end
+ %call4 = tail call i32 @b(i32 %x) nounwind
+ br label %return
+
+if.end5: ; preds = %if.end
+ %call6 = tail call i32 @c(i32 %x) nounwind
+ br label %return
+
+return: ; preds = %if.end5, %if.then3, %if.then
+ %retval.0 = phi i32 [ %call, %if.then ], [ %call4, %if.then3 ], [ %call6, %if.end5 ]
+ ret i32 %retval.0
+}
+
+declare float @sinf(float) nounwind readonly
+declare double @sin(double) nounwind readonly
+declare void @bar() nounwind
+declare void @foo() nounwind
+declare i32 @a(i32)
+declare i32 @b(i32)
+declare i32 @c(i32)
diff --git a/test/CodeGen/AArch64/arm64-cast-opt.ll b/test/CodeGen/AArch64/arm64-cast-opt.ll
new file mode 100644
index 0000000..65a871d
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-cast-opt.ll
@@ -0,0 +1,31 @@
+; RUN: llc -O3 -march=arm64 -mtriple arm64-apple-ios5.0.0 < %s | FileCheck %s
+; <rdar://problem/15992732>
+; Zero truncation is not necessary when the values are extended properly
+; already.
+
+@block = common global i8* null, align 8
+
+define zeroext i8 @foo(i32 %i1, i32 %i2) {
+; CHECK-LABEL: foo:
+; CHECK: cset
+; CHECK-NOT: and
+entry:
+ %idxprom = sext i32 %i1 to i64
+ %0 = load i8** @block, align 8
+ %arrayidx = getelementptr inbounds i8* %0, i64 %idxprom
+ %1 = load i8* %arrayidx, align 1
+ %idxprom1 = sext i32 %i2 to i64
+ %arrayidx2 = getelementptr inbounds i8* %0, i64 %idxprom1
+ %2 = load i8* %arrayidx2, align 1
+ %cmp = icmp eq i8 %1, %2
+ br i1 %cmp, label %return, label %if.then
+
+if.then: ; preds = %entry
+ %cmp7 = icmp ugt i8 %1, %2
+ %conv9 = zext i1 %cmp7 to i8
+ br label %return
+
+return: ; preds = %entry, %if.then
+ %retval.0 = phi i8 [ %conv9, %if.then ], [ 1, %entry ]
+ ret i8 %retval.0
+}
diff --git a/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll b/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll
new file mode 100644
index 0000000..664a26c
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll
@@ -0,0 +1,190 @@
+; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -aarch64-ccmp | FileCheck %s
+target triple = "arm64-apple-ios7.0.0"
+
+@channelColumns = external global i64
+@channelTracks = external global i64
+@mazeRoute = external hidden unnamed_addr global i8*, align 8
+@TOP = external global i64*
+@BOT = external global i64*
+@netsAssign = external global i64*
+
+; Function from yacr2/maze.c
+; The branch at the end of %if.then is driven by %cmp5 and %cmp6.
+; Isel converts the and i1 into two branches, and arm64-ccmp should not convert
+; it back again. %cmp6 has much higher latency than %cmp5.
+; CHECK: Maze1
+; CHECK: %if.then
+; CHECK: cmp x{{[0-9]+}}, #2
+; CHECK-NEXT: b.cc
+; CHECK: %if.then
+; CHECK: cmp x{{[0-9]+}}, #2
+; CHECK-NEXT: b.cc
+define i32 @Maze1() nounwind ssp {
+entry:
+ %0 = load i64* @channelColumns, align 8, !tbaa !0
+ %cmp90 = icmp eq i64 %0, 0
+ br i1 %cmp90, label %for.end, label %for.body
+
+for.body: ; preds = %for.inc, %entry
+ %1 = phi i64 [ %0, %entry ], [ %37, %for.inc ]
+ %i.092 = phi i64 [ 1, %entry ], [ %inc53, %for.inc ]
+ %numLeft.091 = phi i32 [ 0, %entry ], [ %numLeft.1, %for.inc ]
+ %2 = load i8** @mazeRoute, align 8, !tbaa !3
+ %arrayidx = getelementptr inbounds i8* %2, i64 %i.092
+ %3 = load i8* %arrayidx, align 1, !tbaa !1
+ %tobool = icmp eq i8 %3, 0
+ br i1 %tobool, label %for.inc, label %if.then
+
+if.then: ; preds = %for.body
+ %4 = load i64** @TOP, align 8, !tbaa !3
+ %arrayidx1 = getelementptr inbounds i64* %4, i64 %i.092
+ %5 = load i64* %arrayidx1, align 8, !tbaa !0
+ %6 = load i64** @netsAssign, align 8, !tbaa !3
+ %arrayidx2 = getelementptr inbounds i64* %6, i64 %5
+ %7 = load i64* %arrayidx2, align 8, !tbaa !0
+ %8 = load i64** @BOT, align 8, !tbaa !3
+ %arrayidx3 = getelementptr inbounds i64* %8, i64 %i.092
+ %9 = load i64* %arrayidx3, align 8, !tbaa !0
+ %arrayidx4 = getelementptr inbounds i64* %6, i64 %9
+ %10 = load i64* %arrayidx4, align 8, !tbaa !0
+ %cmp5 = icmp ugt i64 %i.092, 1
+ %cmp6 = icmp ugt i64 %10, 1
+ %or.cond = and i1 %cmp5, %cmp6
+ br i1 %or.cond, label %land.lhs.true7, label %if.else
+
+land.lhs.true7: ; preds = %if.then
+ %11 = load i64* @channelTracks, align 8, !tbaa !0
+ %add = add i64 %11, 1
+ %call = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 %add, i64 %10, i64 0, i64 %7, i32 -1, i32 -1)
+ %tobool8 = icmp eq i32 %call, 0
+ br i1 %tobool8, label %land.lhs.true7.if.else_crit_edge, label %if.then9
+
+land.lhs.true7.if.else_crit_edge: ; preds = %land.lhs.true7
+ %.pre = load i64* @channelColumns, align 8, !tbaa !0
+ br label %if.else
+
+if.then9: ; preds = %land.lhs.true7
+ %12 = load i8** @mazeRoute, align 8, !tbaa !3
+ %arrayidx10 = getelementptr inbounds i8* %12, i64 %i.092
+ store i8 0, i8* %arrayidx10, align 1, !tbaa !1
+ %13 = load i64** @TOP, align 8, !tbaa !3
+ %arrayidx11 = getelementptr inbounds i64* %13, i64 %i.092
+ %14 = load i64* %arrayidx11, align 8, !tbaa !0
+ tail call fastcc void @CleanNet(i64 %14)
+ %15 = load i64** @BOT, align 8, !tbaa !3
+ %arrayidx12 = getelementptr inbounds i64* %15, i64 %i.092
+ %16 = load i64* %arrayidx12, align 8, !tbaa !0
+ tail call fastcc void @CleanNet(i64 %16)
+ br label %for.inc
+
+if.else: ; preds = %land.lhs.true7.if.else_crit_edge, %if.then
+ %17 = phi i64 [ %.pre, %land.lhs.true7.if.else_crit_edge ], [ %1, %if.then ]
+ %cmp13 = icmp ult i64 %i.092, %17
+ %or.cond89 = and i1 %cmp13, %cmp6
+ br i1 %or.cond89, label %land.lhs.true16, label %if.else24
+
+land.lhs.true16: ; preds = %if.else
+ %18 = load i64* @channelTracks, align 8, !tbaa !0
+ %add17 = add i64 %18, 1
+ %call18 = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 %add17, i64 %10, i64 0, i64 %7, i32 1, i32 -1)
+ %tobool19 = icmp eq i32 %call18, 0
+ br i1 %tobool19, label %if.else24, label %if.then20
+
+if.then20: ; preds = %land.lhs.true16
+ %19 = load i8** @mazeRoute, align 8, !tbaa !3
+ %arrayidx21 = getelementptr inbounds i8* %19, i64 %i.092
+ store i8 0, i8* %arrayidx21, align 1, !tbaa !1
+ %20 = load i64** @TOP, align 8, !tbaa !3
+ %arrayidx22 = getelementptr inbounds i64* %20, i64 %i.092
+ %21 = load i64* %arrayidx22, align 8, !tbaa !0
+ tail call fastcc void @CleanNet(i64 %21)
+ %22 = load i64** @BOT, align 8, !tbaa !3
+ %arrayidx23 = getelementptr inbounds i64* %22, i64 %i.092
+ %23 = load i64* %arrayidx23, align 8, !tbaa !0
+ tail call fastcc void @CleanNet(i64 %23)
+ br label %for.inc
+
+if.else24: ; preds = %land.lhs.true16, %if.else
+ br i1 %cmp5, label %land.lhs.true26, label %if.else36
+
+land.lhs.true26: ; preds = %if.else24
+ %24 = load i64* @channelTracks, align 8, !tbaa !0
+ %cmp27 = icmp ult i64 %7, %24
+ br i1 %cmp27, label %land.lhs.true28, label %if.else36
+
+land.lhs.true28: ; preds = %land.lhs.true26
+ %add29 = add i64 %24, 1
+ %call30 = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 0, i64 %7, i64 %add29, i64 %10, i32 -1, i32 1)
+ %tobool31 = icmp eq i32 %call30, 0
+ br i1 %tobool31, label %if.else36, label %if.then32
+
+if.then32: ; preds = %land.lhs.true28
+ %25 = load i8** @mazeRoute, align 8, !tbaa !3
+ %arrayidx33 = getelementptr inbounds i8* %25, i64 %i.092
+ store i8 0, i8* %arrayidx33, align 1, !tbaa !1
+ %26 = load i64** @TOP, align 8, !tbaa !3
+ %arrayidx34 = getelementptr inbounds i64* %26, i64 %i.092
+ %27 = load i64* %arrayidx34, align 8, !tbaa !0
+ tail call fastcc void @CleanNet(i64 %27)
+ %28 = load i64** @BOT, align 8, !tbaa !3
+ %arrayidx35 = getelementptr inbounds i64* %28, i64 %i.092
+ %29 = load i64* %arrayidx35, align 8, !tbaa !0
+ tail call fastcc void @CleanNet(i64 %29)
+ br label %for.inc
+
+if.else36: ; preds = %land.lhs.true28, %land.lhs.true26, %if.else24
+ %30 = load i64* @channelColumns, align 8, !tbaa !0
+ %cmp37 = icmp ult i64 %i.092, %30
+ br i1 %cmp37, label %land.lhs.true38, label %if.else48
+
+land.lhs.true38: ; preds = %if.else36
+ %31 = load i64* @channelTracks, align 8, !tbaa !0
+ %cmp39 = icmp ult i64 %7, %31
+ br i1 %cmp39, label %land.lhs.true40, label %if.else48
+
+land.lhs.true40: ; preds = %land.lhs.true38
+ %add41 = add i64 %31, 1
+ %call42 = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 0, i64 %7, i64 %add41, i64 %10, i32 1, i32 1)
+ %tobool43 = icmp eq i32 %call42, 0
+ br i1 %tobool43, label %if.else48, label %if.then44
+
+if.then44: ; preds = %land.lhs.true40
+ %32 = load i8** @mazeRoute, align 8, !tbaa !3
+ %arrayidx45 = getelementptr inbounds i8* %32, i64 %i.092
+ store i8 0, i8* %arrayidx45, align 1, !tbaa !1
+ %33 = load i64** @TOP, align 8, !tbaa !3
+ %arrayidx46 = getelementptr inbounds i64* %33, i64 %i.092
+ %34 = load i64* %arrayidx46, align 8, !tbaa !0
+ tail call fastcc void @CleanNet(i64 %34)
+ %35 = load i64** @BOT, align 8, !tbaa !3
+ %arrayidx47 = getelementptr inbounds i64* %35, i64 %i.092
+ %36 = load i64* %arrayidx47, align 8, !tbaa !0
+ tail call fastcc void @CleanNet(i64 %36)
+ br label %for.inc
+
+if.else48: ; preds = %land.lhs.true40, %land.lhs.true38, %if.else36
+ %inc = add nsw i32 %numLeft.091, 1
+ br label %for.inc
+
+for.inc: ; preds = %if.else48, %if.then44, %if.then32, %if.then20, %if.then9, %for.body
+ %numLeft.1 = phi i32 [ %numLeft.091, %if.then9 ], [ %numLeft.091, %if.then20 ], [ %numLeft.091, %if.then32 ], [ %numLeft.091, %if.then44 ], [ %inc, %if.else48 ], [ %numLeft.091, %for.body ]
+ %inc53 = add i64 %i.092, 1
+ %37 = load i64* @channelColumns, align 8, !tbaa !0
+ %cmp = icmp ugt i64 %inc53, %37
+ br i1 %cmp, label %for.end, label %for.body
+
+for.end: ; preds = %for.inc, %entry
+ %numLeft.0.lcssa = phi i32 [ 0, %entry ], [ %numLeft.1, %for.inc ]
+ ret i32 %numLeft.0.lcssa
+}
+
+; Materializable
+declare hidden fastcc i32 @Maze1Mech(i64, i64, i64, i64, i64, i32, i32) nounwind ssp
+
+; Materializable
+declare hidden fastcc void @CleanNet(i64) nounwind ssp
+
+!0 = metadata !{metadata !"long", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"any pointer", metadata !1}
diff --git a/test/CodeGen/AArch64/arm64-ccmp.ll b/test/CodeGen/AArch64/arm64-ccmp.ll
new file mode 100644
index 0000000..63965f9
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-ccmp.ll
@@ -0,0 +1,289 @@
+; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -aarch64-ccmp -aarch64-stress-ccmp | FileCheck %s
+target triple = "arm64-apple-ios"
+
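+; Note: "ccmp w, #imm, #nzcv, cond" performs the compare only when "cond" holds
+; for the current flags; otherwise the flags are set to the literal #nzcv value
+; (bit 3 = N, bit 2 = Z, bit 1 = C, bit 0 = V).
+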
+; CHECK: single_same
+; CHECK: cmp w0, #5
+; CHECK-NEXT: ccmp w1, #17, #4, ne
+; CHECK-NEXT: b.ne
+; CHECK: %if.then
+; CHECK: bl _foo
+; CHECK: %if.end
+define i32 @single_same(i32 %a, i32 %b) nounwind ssp {
+entry:
+ %cmp = icmp eq i32 %a, 5
+ %cmp1 = icmp eq i32 %b, 17
+ %or.cond = or i1 %cmp, %cmp1
+ br i1 %or.cond, label %if.then, label %if.end
+
+if.then:
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end:
+ ret i32 7
+}
+
+; Different condition codes for the two compares.
+; CHECK: single_different
+; CHECK: cmp w0, #6
+; CHECK-NEXT: ccmp w1, #17, #0, ge
+; CHECK-NEXT: b.eq
+; CHECK: %if.then
+; CHECK: bl _foo
+; CHECK: %if.end
+define i32 @single_different(i32 %a, i32 %b) nounwind ssp {
+entry:
+ %cmp = icmp sle i32 %a, 5
+ %cmp1 = icmp ne i32 %b, 17
+ %or.cond = or i1 %cmp, %cmp1
+ br i1 %or.cond, label %if.then, label %if.end
+
+if.then:
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end:
+ ret i32 7
+}
+
+; Second block clobbers the flags, can't convert (easily).
+; CHECK: single_flagclobber
+; CHECK: cmp
+; CHECK: b.eq
+; CHECK: cmp
+; CHECK: b.gt
+define i32 @single_flagclobber(i32 %a, i32 %b) nounwind ssp {
+entry:
+ %cmp = icmp eq i32 %a, 5
+ br i1 %cmp, label %if.then, label %lor.lhs.false
+
+lor.lhs.false: ; preds = %entry
+ %cmp1 = icmp slt i32 %b, 7
+ %mul = shl nsw i32 %b, 1
+ %add = add nsw i32 %b, 1
+ %cond = select i1 %cmp1, i32 %mul, i32 %add
+ %cmp2 = icmp slt i32 %cond, 17
+ br i1 %cmp2, label %if.then, label %if.end
+
+if.then: ; preds = %lor.lhs.false, %entry
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end: ; preds = %if.then, %lor.lhs.false
+ ret i32 7
+}
+
+; Second block clobbers the flags and ends with a tbz terminator.
+; CHECK: single_flagclobber_tbz
+; CHECK: cmp
+; CHECK: b.eq
+; CHECK: cmp
+; CHECK: tbz
+define i32 @single_flagclobber_tbz(i32 %a, i32 %b) nounwind ssp {
+entry:
+ %cmp = icmp eq i32 %a, 5
+ br i1 %cmp, label %if.then, label %lor.lhs.false
+
+lor.lhs.false: ; preds = %entry
+ %cmp1 = icmp slt i32 %b, 7
+ %mul = shl nsw i32 %b, 1
+ %add = add nsw i32 %b, 1
+ %cond = select i1 %cmp1, i32 %mul, i32 %add
+ %and = and i32 %cond, 8
+ %cmp2 = icmp ne i32 %and, 0
+ br i1 %cmp2, label %if.then, label %if.end
+
+if.then: ; preds = %lor.lhs.false, %entry
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end: ; preds = %if.then, %lor.lhs.false
+ ret i32 7
+}
+
+; Speculatively execute a division that may be by zero.
+; The sdiv/udiv instructions do not trap when the divisor is zero, so they are
+; safe to speculate.
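+; (On AArch64, udiv/sdiv with a zero divisor produce zero instead of trapping.)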
+; CHECK: speculate_division
+; CHECK-NOT: cmp
+; CHECK: sdiv
+; CHECK: cmp
+; CHECK-NEXT: ccmp
+define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp {
+entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true:
+ %div = sdiv i32 %b, %a
+ %cmp1 = icmp slt i32 %div, 17
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end:
+ ret i32 7
+}
+
+; Floating point compare.
+; CHECK: single_fcmp
+; CHECK: cmp
+; CHECK-NOT: b.
+; CHECK: fccmp {{.*}}, #8, ge
+; CHECK: b.lt
+define i32 @single_fcmp(i32 %a, float %b) nounwind ssp {
+entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true:
+ %conv = sitofp i32 %a to float
+ %div = fdiv float %b, %conv
+ %cmp1 = fcmp oge float %div, 1.700000e+01
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end:
+ ret i32 7
+}
+
+; Chain multiple compares.
+; CHECK: multi_different
+; CHECK: cmp
+; CHECK: ccmp
+; CHECK: ccmp
+; CHECK: b.
+define void @multi_different(i32 %a, i32 %b, i32 %c) nounwind ssp {
+entry:
+ %cmp = icmp sgt i32 %a, %b
+ br i1 %cmp, label %land.lhs.true, label %if.end
+
+land.lhs.true:
+ %div = sdiv i32 %b, %a
+ %cmp1 = icmp eq i32 %div, 5
+ %cmp4 = icmp sgt i32 %div, %c
+ %or.cond = and i1 %cmp1, %cmp4
+ br i1 %or.cond, label %if.then, label %if.end
+
+if.then:
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+; Convert a cbz in the head block.
+; CHECK: cbz_head
+; CHECK: cmp w0, #0
+; CHECK: ccmp
+define i32 @cbz_head(i32 %a, i32 %b) nounwind ssp {
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %cmp1 = icmp ne i32 %b, 17
+ %or.cond = or i1 %cmp, %cmp1
+ br i1 %or.cond, label %if.then, label %if.end
+
+if.then:
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end:
+ ret i32 7
+}
+
+; Check that the immediate operand is in range. The ccmp instruction encodes a
+; smaller range of immediates than subs/adds.
+; The ccmp immediates must be in the range 0-31.
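+; Here %b is compared with 32, which does not fit in that range, so no ccmp
+; should be formed.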
+; CHECK: immediate_range
+; CHECK-NOT: ccmp
+define i32 @immediate_range(i32 %a, i32 %b) nounwind ssp {
+entry:
+ %cmp = icmp eq i32 %a, 5
+ %cmp1 = icmp eq i32 %b, 32
+ %or.cond = or i1 %cmp, %cmp1
+ br i1 %or.cond, label %if.then, label %if.end
+
+if.then:
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end:
+ ret i32 7
+}
+
+; Convert a cbz in the second block.
+; CHECK: cbz_second
+; CHECK: cmp w0, #0
+; CHECK: ccmp w1, #0, #0, ne
+; CHECK: b.eq
+define i32 @cbz_second(i32 %a, i32 %b) nounwind ssp {
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %cmp1 = icmp ne i32 %b, 0
+ %or.cond = or i1 %cmp, %cmp1
+ br i1 %or.cond, label %if.then, label %if.end
+
+if.then:
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end:
+ ret i32 7
+}
+
+; Convert a cbnz in the second block.
+; CHECK: cbnz_second
+; CHECK: cmp w0, #0
+; CHECK: ccmp w1, #0, #4, ne
+; CHECK: b.ne
+define i32 @cbnz_second(i32 %a, i32 %b) nounwind ssp {
+entry:
+ %cmp = icmp eq i32 %a, 0
+ %cmp1 = icmp eq i32 %b, 0
+ %or.cond = or i1 %cmp, %cmp1
+ br i1 %or.cond, label %if.then, label %if.end
+
+if.then:
+ %call = tail call i32 @foo() nounwind
+ br label %if.end
+
+if.end:
+ ret i32 7
+}
+declare i32 @foo()
+
+%str1 = type { %str2 }
+%str2 = type { [24 x i8], i8*, i32, %str1*, i32, [4 x i8], %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, i8*, i8, i8*, %str1*, i8* }
+
+; Test case distilled from 126.gcc.
+; The phi in sw.bb.i.i gets multiple operands for the %entry predecessor.
+; CHECK: build_modify_expr
+define void @build_modify_expr() nounwind ssp {
+entry:
+ switch i32 undef, label %sw.bb.i.i [
+ i32 69, label %if.end85
+ i32 70, label %if.end85
+ i32 71, label %if.end85
+ i32 72, label %if.end85
+ i32 73, label %if.end85
+ i32 105, label %if.end85
+ i32 106, label %if.end85
+ ]
+
+if.end85:
+ ret void
+
+sw.bb.i.i:
+ %ref.tr.i.i = phi %str1* [ %0, %sw.bb.i.i ], [ undef, %entry ]
+ %operands.i.i = getelementptr inbounds %str1* %ref.tr.i.i, i64 0, i32 0, i32 2
+ %arrayidx.i.i = bitcast i32* %operands.i.i to %str1**
+ %0 = load %str1** %arrayidx.i.i, align 8
+ %code1.i.i.phi.trans.insert = getelementptr inbounds %str1* %0, i64 0, i32 0, i32 0, i64 16
+ br label %sw.bb.i.i
+}
diff --git a/test/CodeGen/AArch64/arm64-clrsb.ll b/test/CodeGen/AArch64/arm64-clrsb.ll
new file mode 100644
index 0000000..042e52e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-clrsb.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios7.0.0"
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ctlz.i32(i32, i1) #0
+declare i64 @llvm.ctlz.i64(i64, i1) #1
+
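+; The functions below use the usual clrsb(x) expansion,
+; ctlz(((x ^ (x >> 31)) << 1) | 1) (63 for the i64 version), which should be
+; matched to a single cls instruction.
+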
+; Function Attrs: nounwind ssp
+define i32 @clrsb32(i32 %x) #2 {
+entry:
+ %shr = ashr i32 %x, 31
+ %xor = xor i32 %shr, %x
+ %mul = shl i32 %xor, 1
+ %add = or i32 %mul, 1
+ %0 = tail call i32 @llvm.ctlz.i32(i32 %add, i1 false)
+
+ ret i32 %0
+; CHECK-LABEL: clrsb32
+; CHECK: cls [[TEMP:w[0-9]+]], [[TEMP]]
+}
+
+; Function Attrs: nounwind ssp
+define i64 @clrsb64(i64 %x) #3 {
+entry:
+ %shr = ashr i64 %x, 63
+ %xor = xor i64 %shr, %x
+ %mul = shl nsw i64 %xor, 1
+ %add = or i64 %mul, 1
+ %0 = tail call i64 @llvm.ctlz.i64(i64 %add, i1 false)
+
+ ret i64 %0
+; CHECK-LABEL: clrsb64
+; CHECK: cls [[TEMP:x[0-9]+]], [[TEMP]]
+}
diff --git a/test/CodeGen/AArch64/arm64-coalesce-ext.ll b/test/CodeGen/AArch64/arm64-coalesce-ext.ll
new file mode 100644
index 0000000..9420bf3
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-coalesce-ext.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=arm64 -mtriple=arm64-apple-darwin < %s | FileCheck %s
+; Check that the peephole optimizer knows about sext and zext instructions.
+; CHECK: test1sext
+define i32 @test1sext(i64 %A, i64 %B, i32* %P, i64 *%P2) nounwind {
+ %C = add i64 %A, %B
+ ; CHECK: add x[[SUM:[0-9]+]], x0, x1
+ %D = trunc i64 %C to i32
+ %E = shl i64 %C, 32
+ %F = ashr i64 %E, 32
+ ; CHECK: sxtw x[[EXT:[0-9]+]], w[[SUM]]
+ store volatile i64 %F, i64 *%P2
+ ; CHECK: str x[[EXT]]
+ store volatile i32 %D, i32* %P
+ ; Reuse low bits of extended register, don't extend live range of SUM.
+ ; CHECK: str w[[SUM]]
+ ret i32 %D
+}
diff --git a/test/CodeGen/AArch64/arm64-code-model-large-abs.ll b/test/CodeGen/AArch64/arm64-code-model-large-abs.ll
new file mode 100644
index 0000000..264da2d
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-code-model-large-abs.ll
@@ -0,0 +1,72 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -code-model=large < %s | FileCheck %s
+
+@var8 = global i8 0
+@var16 = global i16 0
+@var32 = global i32 0
+@var64 = global i64 0
+
+define i8* @global_addr() {
+; CHECK-LABEL: global_addr:
+ ret i8* @var8
+ ; The movz/movk calculation should end up being returned directly in x0.
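+ ; Each :abs_gN: relocation selects a 16-bit slice of the absolute address
+ ; (g3 = bits 63:48 down to g0 = bits 15:0), so the movz/movk sequence builds
+ ; the full 64-bit address.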
+; CHECK: movz x0, #:abs_g3:var8
+; CHECK: movk x0, #:abs_g2_nc:var8
+; CHECK: movk x0, #:abs_g1_nc:var8
+; CHECK: movk x0, #:abs_g0_nc:var8
+; CHECK-NEXT: ret
+}
+
+define i8 @global_i8() {
+; CHECK-LABEL: global_i8:
+ %val = load i8* @var8
+ ret i8 %val
+; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var8
+; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var8
+; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var8
+; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var8
+; CHECK: ldrb w0, [x[[ADDR_REG]]]
+}
+
+define i16 @global_i16() {
+; CHECK-LABEL: global_i16:
+ %val = load i16* @var16
+ ret i16 %val
+; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var16
+; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var16
+; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var16
+; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var16
+; CHECK: ldrh w0, [x[[ADDR_REG]]]
+}
+
+define i32 @global_i32() {
+; CHECK-LABEL: global_i32:
+ %val = load i32* @var32
+ ret i32 %val
+; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var32
+; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var32
+; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var32
+; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var32
+; CHECK: ldr w0, [x[[ADDR_REG]]]
+}
+
+define i64 @global_i64() {
+; CHECK-LABEL: global_i64:
+ %val = load i64* @var64
+ ret i64 %val
+; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var64
+; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var64
+; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var64
+; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var64
+; CHECK: ldr x0, [x[[ADDR_REG]]]
+}
+
+define <2 x i64> @constpool() {
+; CHECK-LABEL: constpool:
+ ret <2 x i64> <i64 123456789, i64 987654321100>
+
+; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:[[CPADDR:.LCPI[0-9]+_[0-9]+]]
+; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:[[CPADDR]]
+; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:[[CPADDR]]
+; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:[[CPADDR]]
+; CHECK: ldr q0, [x[[ADDR_REG]]]
+}
diff --git a/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll b/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll
new file mode 100644
index 0000000..81cee38
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=arm64-apple-ios -O3 -aarch64-collect-loh -aarch64-collect-loh-bb-only=true -aarch64-collect-loh-pre-collect-register=false < %s -o - | FileCheck %s
+; Check that the LOH analysis does not crash when the analyzed chain
+; contains instructions that are filtered out.
+;
+; Before the fix for <rdar://problem/16041712>, these cases were removed
+; from the main container. Now, the deterministic container does not allow
+; removing arbitrary values, so we have to live with garbage values.
+; <rdar://problem/16041712>
+
+%"class.H4ISP::H4ISPDevice" = type { i32 (%"class.H4ISP::H4ISPDevice"*, i32, i8*, i8*)*, i8*, i32*, %"class.H4ISP::H4ISPCameraManager"* }
+
+%"class.H4ISP::H4ISPCameraManager" = type opaque
+
+declare i32 @_ZN5H4ISP11H4ISPDevice32ISP_SelectBestMIPIFrequencyIndexEjPj(%"class.H4ISP::H4ISPDevice"*)
+
+@pH4ISPDevice = hidden global %"class.H4ISP::H4ISPDevice"* null, align 8
+
+; CHECK-LABEL: _foo:
+; CHECK: ret
+; CHECK-NOT: .loh AdrpLdrGotLdr
+define void @foo() {
+entry:
+ br label %if.then83
+if.then83: ; preds = %if.end81
+ %tmp = load %"class.H4ISP::H4ISPDevice"** @pH4ISPDevice, align 8
+ %call84 = call i32 @_ZN5H4ISP11H4ISPDevice32ISP_SelectBestMIPIFrequencyIndexEjPj(%"class.H4ISP::H4ISPDevice"* %tmp) #19
+ tail call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27}"()
+ %tmp2 = load %"class.H4ISP::H4ISPDevice"** @pH4ISPDevice, align 8
+ tail call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x28}"()
+ %pCameraManager.i268 = getelementptr inbounds %"class.H4ISP::H4ISPDevice"* %tmp2, i64 0, i32 3
+ %tmp3 = load %"class.H4ISP::H4ISPCameraManager"** %pCameraManager.i268, align 8
+ %tobool.i269 = icmp eq %"class.H4ISP::H4ISPCameraManager"* %tmp3, null
+ br i1 %tobool.i269, label %if.then83, label %end
+end:
+ ret void
+}
+
diff --git a/test/CodeGen/AArch64/arm64-collect-loh-str.ll b/test/CodeGen/AArch64/arm64-collect-loh-str.ll
new file mode 100644
index 0000000..d7bc00e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-collect-loh-str.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple=arm64-apple-ios -O2 -aarch64-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s
+; Test case for <rdar://problem/15942912>.
+; AdrpAddStr cannot be used when the store uses the same
+; register as address and value. Indeed, the related
+; LOH, if applied, may completely remove the definition or
+; at least provide a wrong one (with the offset folded
+; into the definition).
+
+%struct.anon = type { i32*, i32** }
+
+@pptp_wan_head = internal global %struct.anon zeroinitializer, align 8
+
+; CHECK-LABEL: _pptp_wan_init
+; CHECK: ret
+; CHECK-NOT: AdrpAddStr
+define i32 @pptp_wan_init() {
+entry:
+ store i32* null, i32** getelementptr inbounds (%struct.anon* @pptp_wan_head, i64 0, i32 0), align 8
+ store i32** getelementptr inbounds (%struct.anon* @pptp_wan_head, i64 0, i32 0), i32*** getelementptr inbounds (%struct.anon* @pptp_wan_head, i64 0, i32 1), align 8
+ ret i32 0
+}
+
+
diff --git a/test/CodeGen/AArch64/arm64-collect-loh.ll b/test/CodeGen/AArch64/arm64-collect-loh.ll
new file mode 100644
index 0000000..6d73daa
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-collect-loh.ll
@@ -0,0 +1,53 @@
+; RUN: llc -mtriple=arm64-apple-ios -O2 -aarch64-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -O2 -aarch64-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s --check-prefix=CHECK-ELF
+
+; CHECK-ELF-NOT: .loh
+; CHECK-ELF-NOT: AdrpAdrp
+; CHECK-ELF-NOT: AdrpAdd
+; CHECK-ELF-NOT: AdrpLdrGot
+
+@a = internal unnamed_addr global i32 0, align 4
+@b = external global i32
+
+; Function Attrs: noinline nounwind ssp
+define void @foo(i32 %t) {
+entry:
+ %tmp = load i32* @a, align 4
+ %add = add nsw i32 %tmp, %t
+ store i32 %add, i32* @a, align 4
+ ret void
+}
+
+; Function Attrs: nounwind ssp
+; Testcase for <rdar://problem/15438605>: AdrpAdrp reuse is valid only when the first adrp
+; dominates the second.
+; The first adrp comes from the load of 'a' and the second from the load of 'b'.
+; 'a' is loaded in if.then, 'b' in if.end4; if.then does not dominate if.end4.
+; CHECK-LABEL: _test
+; CHECK: ret
+; CHECK-NOT: .loh AdrpAdrp
+define i32 @test(i32 %t) {
+entry:
+ %cmp = icmp sgt i32 %t, 5
+ br i1 %cmp, label %if.then, label %if.end4
+
+if.then: ; preds = %entry
+ %tmp = load i32* @a, align 4
+ %add = add nsw i32 %tmp, %t
+ %cmp1 = icmp sgt i32 %add, 12
+ br i1 %cmp1, label %if.then2, label %if.end4
+
+if.then2: ; preds = %if.then
+ tail call void @foo(i32 %add)
+ %tmp1 = load i32* @a, align 4
+ br label %if.end4
+
+if.end4: ; preds = %if.then2, %if.then, %entry
+ %t.addr.0 = phi i32 [ %tmp1, %if.then2 ], [ %t, %if.then ], [ %t, %entry ]
+ %tmp2 = load i32* @b, align 4
+ %add5 = add nsw i32 %tmp2, %t.addr.0
+ tail call void @foo(i32 %add5)
+ %tmp3 = load i32* @b, align 4
+ %add6 = add nsw i32 %tmp3, %t.addr.0
+ ret i32 %add6
+}
diff --git a/test/CodeGen/AArch64/arm64-complex-copy-noneon.ll b/test/CodeGen/AArch64/arm64-complex-copy-noneon.ll
new file mode 100644
index 0000000..f65b116
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-complex-copy-noneon.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=-neon < %s
+
+; The DAG combiner decided to use a vector load/store for this struct copy
+; previously. This probably shouldn't happen without NEON, but the most
+; important thing is that it compiles.
+
+define void @store_combine() nounwind {
+ %src = alloca { double, double }, align 8
+ %dst = alloca { double, double }, align 8
+
+ %src.realp = getelementptr inbounds { double, double }* %src, i32 0, i32 0
+ %src.real = load double* %src.realp
+ %src.imagp = getelementptr inbounds { double, double }* %src, i32 0, i32 1
+ %src.imag = load double* %src.imagp
+
+ %dst.realp = getelementptr inbounds { double, double }* %dst, i32 0, i32 0
+ %dst.imagp = getelementptr inbounds { double, double }* %dst, i32 0, i32 1
+ store double %src.real, double* %dst.realp
+ store double %src.imag, double* %dst.imagp
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-complex-ret.ll b/test/CodeGen/AArch64/arm64-complex-ret.ll
new file mode 100644
index 0000000..93d50a5
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-complex-ret.ll
@@ -0,0 +1,7 @@
+; RUN: llc -march=arm64 -o - %s | FileCheck %s
+
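+; An aggregate this large is returned indirectly: under AAPCS64 the address of
+; the result memory is passed in x8, which is why the stores below go through x8.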
+define { i192, i192, i21, i192 } @foo(i192) {
+; CHECK-LABEL: foo:
+; CHECK: stp xzr, xzr, [x8]
+ ret { i192, i192, i21, i192 } {i192 0, i192 1, i21 2, i192 3}
+}
diff --git a/test/CodeGen/AArch64/arm64-const-addr.ll b/test/CodeGen/AArch64/arm64-const-addr.ll
new file mode 100644
index 0000000..c55a922
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-const-addr.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple=arm64-darwin-unknown < %s | FileCheck %s
+
+%T = type { i32, i32, i32, i32 }
+
+; Test that the constant base address gets materialized only once.
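+; 68141056 is 0x040fc000, so it should be built with movz #0x40f, lsl #16
+; followed by movk #0xc000, as checked below.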
+define i32 @test1() nounwind {
+; CHECK-LABEL: test1
+; CHECK: movz w8, #0x40f, lsl #16
+; CHECK-NEXT: movk w8, #0xc000
+; CHECK-NEXT: ldp w9, w10, [x8, #4]
+; CHECK: ldr w8, [x8, #12]
+ %at = inttoptr i64 68141056 to %T*
+ %o1 = getelementptr %T* %at, i32 0, i32 1
+ %t1 = load i32* %o1
+ %o2 = getelementptr %T* %at, i32 0, i32 2
+ %t2 = load i32* %o2
+ %a1 = add i32 %t1, %t2
+ %o3 = getelementptr %T* %at, i32 0, i32 3
+ %t3 = load i32* %o3
+ %a2 = add i32 %a1, %t3
+ ret i32 %a2
+}
+
diff --git a/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll b/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll
new file mode 100644
index 0000000..d862b1e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+; CHECK: fptosi_1
+; CHECK: fcvtzs.2d
+; CHECK: xtn.2s
+; CHECK: ret
+define void @fptosi_1() nounwind noinline ssp {
+entry:
+ %0 = fptosi <2 x double> undef to <2 x i32>
+ store <2 x i32> %0, <2 x i32>* undef, align 8
+ ret void
+}
+
+; CHECK: fptoui_1
+; CHECK: fcvtzu.2d
+; CHECK: xtn.2s
+; CHECK: ret
+define void @fptoui_1() nounwind noinline ssp {
+entry:
+ %0 = fptoui <2 x double> undef to <2 x i32>
+ store <2 x i32> %0, <2 x i32>* undef, align 8
+ ret void
+}
+
diff --git a/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll b/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll
new file mode 100644
index 0000000..daaf1e0
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <2 x double> @f1(<2 x i32> %v) nounwind readnone {
+; CHECK-LABEL: f1:
+; CHECK: sshll.2d v0, v0, #0
+; CHECK-NEXT: scvtf.2d v0, v0
+; CHECK-NEXT: ret
+ %conv = sitofp <2 x i32> %v to <2 x double>
+ ret <2 x double> %conv
+}
+define <2 x double> @f2(<2 x i32> %v) nounwind readnone {
+; CHECK-LABEL: f2:
+; CHECK: ushll.2d v0, v0, #0
+; CHECK-NEXT: ucvtf.2d v0, v0
+; CHECK-NEXT: ret
+ %conv = uitofp <2 x i32> %v to <2 x double>
+ ret <2 x double> %conv
+}
+
+; CHECK: autogen_SD19655
+; CHECK: scvtf
+; CHECK: ret
+define void @autogen_SD19655() {
+ %T = load <2 x i64>* undef
+ %F = sitofp <2 x i64> undef to <2 x float>
+ store <2 x float> %F, <2 x float>* undef
+ ret void
+}
+
diff --git a/test/CodeGen/AArch64/arm64-copy-tuple.ll b/test/CodeGen/AArch64/arm64-copy-tuple.ll
new file mode 100644
index 0000000..1803787
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-copy-tuple.ll
@@ -0,0 +1,146 @@
+; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s
+
+; The main purpose of this test is to find out whether copyPhysReg can deal with
+; the memmove-like situation arising in tuples, where an early copy can clobber
+; the value needed by a later one if the tuples overlap.
+
+; We use dummy inline asm to force LLVM to generate a COPY between the registers
+; we want by clobbering all the others.
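+; For example, copying D0_D1 into D1_D2 must write D2 before D1 (as the first
+; test expects), otherwise the original value of D1 would be clobbered before
+; it is read.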
+
+define void @test_D1D2_from_D0D1(i8* %addr) #0 {
+; CHECK-LABEL: test_D1D2_from_D0D1:
+; CHECK: mov.8b v2, v1
+; CHECK: mov.8b v1, v0
+entry:
+ %addr_v8i8 = bitcast i8* %addr to <8 x i8>*
+ %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+ %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0
+ %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1
+ tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+
+ tail call void asm sideeffect "", "~{v0},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+ ret void
+}
+
+define void @test_D0D1_from_D1D2(i8* %addr) #0 {
+; CHECK-LABEL: test_D0D1_from_D1D2:
+; CHECK: mov.8b v0, v1
+; CHECK: mov.8b v1, v2
+entry:
+ %addr_v8i8 = bitcast i8* %addr to <8 x i8>*
+ %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+ %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0
+ %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1
+ tail call void asm sideeffect "", "~{v0},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+
+ tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+ ret void
+}
+
+define void @test_D0D1_from_D31D0(i8* %addr) #0 {
+; CHECK-LABEL: test_D0D1_from_D31D0:
+; CHECK: mov.8b v1, v0
+; CHECK: mov.8b v0, v31
+entry:
+ %addr_v8i8 = bitcast i8* %addr to <8 x i8>*
+ %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+ %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0
+ %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1
+ tail call void asm sideeffect "", "~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30}"()
+ tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+
+ tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+ ret void
+}
+
+define void @test_D31D0_from_D0D1(i8* %addr) #0 {
+; CHECK-LABEL: test_D31D0_from_D0D1:
+; CHECK: mov.8b v31, v0
+; CHECK: mov.8b v0, v1
+entry:
+ %addr_v8i8 = bitcast i8* %addr to <8 x i8>*
+ %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+ %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0
+ %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1
+ tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+
+ tail call void asm sideeffect "", "~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30}"()
+ tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+ ret void
+}
+
+define void @test_D2D3D4_from_D0D1D2(i8* %addr) #0 {
+; CHECK-LABEL: test_D2D3D4_from_D0D1D2:
+; CHECK: mov.8b v4, v2
+; CHECK: mov.8b v3, v1
+; CHECK: mov.8b v2, v0
+entry:
+ %addr_v8i8 = bitcast i8* %addr to <8 x i8>*
+ %vec = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+ %vec0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 0
+ %vec1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 1
+ %vec2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 2
+
+ tail call void asm sideeffect "", "~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr)
+
+ tail call void asm sideeffect "", "~{v0},~{v1},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr)
+ ret void
+}
+
+define void @test_Q0Q1Q2_from_Q1Q2Q3(i8* %addr) #0 {
+; CHECK-LABEL: test_Q0Q1Q2_from_Q1Q2Q3:
+; CHECK: mov.16b v0, v1
+; CHECK: mov.16b v1, v2
+; CHECK: mov.16b v2, v3
+entry:
+ %addr_v16i8 = bitcast i8* %addr to <16 x i8>*
+ %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* %addr_v16i8)
+ %vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 0
+ %vec1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 1
+ %vec2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 2
+ tail call void asm sideeffect "", "~{v0},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr)
+
+ tail call void asm sideeffect "", "~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr)
+ ret void
+}
+
+define void @test_Q1Q2Q3Q4_from_Q30Q31Q0Q1(i8* %addr) #0 {
+; CHECK-LABEL: test_Q1Q2Q3Q4_from_Q30Q31Q0Q1:
+; CHECK: mov.16b v4, v1
+; CHECK: mov.16b v3, v0
+; CHECK: mov.16b v2, v31
+; CHECK: mov.16b v1, v30
+ %addr_v16i8 = bitcast i8* %addr to <16 x i8>*
+ %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* %addr_v16i8)
+ %vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 0
+ %vec1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 1
+ %vec2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 2
+ %vec3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 3
+
+ tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}"()
+ tail call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr)
+
+ tail call void asm sideeffect "", "~{v0},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ tail call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr)
+ ret void
+}
+
+declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>*)
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>*)
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>*)
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>*)
+
+declare void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*)
+declare void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*)
+declare void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*)
+declare void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*)
diff --git a/test/CodeGen/AArch64/arm64-crc32.ll b/test/CodeGen/AArch64/arm64-crc32.ll
new file mode 100644
index 0000000..d3099e6
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-crc32.ll
@@ -0,0 +1,71 @@
+; RUN: llc -march=arm64 -mattr=+crc -o - %s | FileCheck %s
+
+define i32 @test_crc32b(i32 %cur, i8 %next) {
+; CHECK-LABEL: test_crc32b:
+; CHECK: crc32b w0, w0, w1
+ %bits = zext i8 %next to i32
+ %val = call i32 @llvm.aarch64.crc32b(i32 %cur, i32 %bits)
+ ret i32 %val
+}
+
+define i32 @test_crc32h(i32 %cur, i16 %next) {
+; CHECK-LABEL: test_crc32h:
+; CHECK: crc32h w0, w0, w1
+ %bits = zext i16 %next to i32
+ %val = call i32 @llvm.aarch64.crc32h(i32 %cur, i32 %bits)
+ ret i32 %val
+}
+
+define i32 @test_crc32w(i32 %cur, i32 %next) {
+; CHECK-LABEL: test_crc32w:
+; CHECK: crc32w w0, w0, w1
+ %val = call i32 @llvm.aarch64.crc32w(i32 %cur, i32 %next)
+ ret i32 %val
+}
+
+define i32 @test_crc32x(i32 %cur, i64 %next) {
+; CHECK-LABEL: test_crc32x:
+; CHECK: crc32x w0, w0, x1
+ %val = call i32 @llvm.aarch64.crc32x(i32 %cur, i64 %next)
+ ret i32 %val
+}
+
+define i32 @test_crc32cb(i32 %cur, i8 %next) {
+; CHECK-LABEL: test_crc32cb:
+; CHECK: crc32cb w0, w0, w1
+ %bits = zext i8 %next to i32
+ %val = call i32 @llvm.aarch64.crc32cb(i32 %cur, i32 %bits)
+ ret i32 %val
+}
+
+define i32 @test_crc32ch(i32 %cur, i16 %next) {
+; CHECK-LABEL: test_crc32ch:
+; CHECK: crc32ch w0, w0, w1
+ %bits = zext i16 %next to i32
+ %val = call i32 @llvm.aarch64.crc32ch(i32 %cur, i32 %bits)
+ ret i32 %val
+}
+
+define i32 @test_crc32cw(i32 %cur, i32 %next) {
+; CHECK-LABEL: test_crc32cw:
+; CHECK: crc32cw w0, w0, w1
+ %val = call i32 @llvm.aarch64.crc32cw(i32 %cur, i32 %next)
+ ret i32 %val
+}
+
+define i32 @test_crc32cx(i32 %cur, i64 %next) {
+; CHECK-LABEL: test_crc32cx:
+; CHECK: crc32cx w0, w0, x1
+ %val = call i32 @llvm.aarch64.crc32cx(i32 %cur, i64 %next)
+ ret i32 %val
+}
+
+declare i32 @llvm.aarch64.crc32b(i32, i32)
+declare i32 @llvm.aarch64.crc32h(i32, i32)
+declare i32 @llvm.aarch64.crc32w(i32, i32)
+declare i32 @llvm.aarch64.crc32x(i32, i64)
+
+declare i32 @llvm.aarch64.crc32cb(i32, i32)
+declare i32 @llvm.aarch64.crc32ch(i32, i32)
+declare i32 @llvm.aarch64.crc32cw(i32, i32)
+declare i32 @llvm.aarch64.crc32cx(i32, i64)
diff --git a/test/CodeGen/AArch64/arm64-crypto.ll b/test/CodeGen/AArch64/arm64-crypto.ll
new file mode 100644
index 0000000..2908b33
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-crypto.ll
@@ -0,0 +1,135 @@
+; RUN: llc -march=arm64 -mattr=crypto -aarch64-neon-syntax=apple -o - %s | FileCheck %s
+
+declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data, <16 x i8> %key)
+declare <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data, <16 x i8> %key)
+declare <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %data)
+declare <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %data)
+
+define <16 x i8> @test_aese(<16 x i8> %data, <16 x i8> %key) {
+; CHECK-LABEL: test_aese:
+; CHECK: aese.16b v0, v1
+ %res = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data, <16 x i8> %key)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_aesd(<16 x i8> %data, <16 x i8> %key) {
+; CHECK-LABEL: test_aesd:
+; CHECK: aesd.16b v0, v1
+ %res = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data, <16 x i8> %key)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_aesmc(<16 x i8> %data) {
+; CHECK-LABEL: test_aesmc:
+; CHECK: aesmc.16b v0, v0
+ %res = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %data)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_aesimc(<16 x i8> %data) {
+; CHECK-LABEL: test_aesimc:
+; CHECK: aesimc.16b v0, v0
+ %res = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %data)
+ ret <16 x i8> %res
+}
+
+declare <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+declare <4 x i32> @llvm.aarch64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+declare <4 x i32> @llvm.aarch64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+declare i32 @llvm.aarch64.crypto.sha1h(i32 %hash_e)
+declare <4 x i32> @llvm.aarch64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11)
+declare <4 x i32> @llvm.aarch64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15)
+
+define <4 x i32> @test_sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha1c:
+; CHECK: fmov [[HASH_E:s[0-9]+]], w0
+; CHECK: sha1c.4s q0, [[HASH_E]], v1
+ %res = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+ ret <4 x i32> %res
+}
+
+; <rdar://problem/14742333> Incomplete removal of unnecessary FMOV instructions in intrinsic SHA1
+define <4 x i32> @test_sha1c_in_a_row(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha1c_in_a_row:
+; CHECK: fmov [[HASH_E:s[0-9]+]], w0
+; CHECK: sha1c.4s q[[SHA1RES:[0-9]+]], [[HASH_E]], v1
+; CHECK-NOT: fmov
+; CHECK: sha1c.4s q0, s[[SHA1RES]], v1
+ %res = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+ %extract = extractelement <4 x i32> %res, i32 0
+ %res2 = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %extract, <4 x i32> %wk)
+ ret <4 x i32> %res2
+}
+
+define <4 x i32> @test_sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha1p:
+; CHECK: fmov [[HASH_E:s[0-9]+]], w0
+; CHECK: sha1p.4s q0, [[HASH_E]], v1
+ %res = call <4 x i32> @llvm.aarch64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha1m:
+; CHECK: fmov [[HASH_E:s[0-9]+]], w0
+; CHECK: sha1m.4s q0, [[HASH_E]], v1
+ %res = call <4 x i32> @llvm.aarch64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+ ret <4 x i32> %res
+}
+
+define i32 @test_sha1h(i32 %hash_e) {
+; CHECK-LABEL: test_sha1h:
+; CHECK: fmov [[HASH_E:s[0-9]+]], w0
+; CHECK: sha1h [[RES:s[0-9]+]], [[HASH_E]]
+; CHECK: fmov w0, [[RES]]
+ %res = call i32 @llvm.aarch64.crypto.sha1h(i32 %hash_e)
+ ret i32 %res
+}
+
+define <4 x i32> @test_sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) {
+; CHECK-LABEL: test_sha1su0:
+; CHECK: sha1su0.4s v0, v1, v2
+ %res = call <4 x i32> @llvm.aarch64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) {
+; CHECK-LABEL: test_sha1su1:
+; CHECK: sha1su1.4s v0, v1
+ %res = call <4 x i32> @llvm.aarch64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15)
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.aarch64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk)
+declare <4 x i32> @llvm.aarch64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk)
+declare <4 x i32> @llvm.aarch64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7)
+declare <4 x i32> @llvm.aarch64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15)
+
+define <4 x i32> @test_sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha256h:
+; CHECK: sha256h.4s q0, q1, v2
+ %res = call <4 x i32> @llvm.aarch64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha256h2:
+; CHECK: sha256h2.4s q0, q1, v2
+
+ %res = call <4 x i32> @llvm.aarch64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) {
+; CHECK-LABEL: test_sha256su0:
+; CHECK: sha256su0.4s v0, v1
+ %res = call <4 x i32> @llvm.aarch64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7)
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) {
+; CHECK-LABEL: test_sha256su1:
+; CHECK: sha256su1.4s v0, v1, v2
+ %res = call <4 x i32> @llvm.aarch64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15)
+ ret <4 x i32> %res
+}
diff --git a/test/CodeGen/AArch64/arm64-cse.ll b/test/CodeGen/AArch64/arm64-cse.ll
new file mode 100644
index 0000000..bb14c89
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-cse.ll
@@ -0,0 +1,59 @@
+; RUN: llc -O3 < %s | FileCheck %s
+target triple = "arm64-apple-ios"
+
+; rdar://12462006
+; CSE between "icmp reg reg" and "sub reg reg".
+; Both can be in the same basic block or in different basic blocks.
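+; A subs both produces the difference and sets the flags, so a separate cmp
+; with the same operands is redundant.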
+define i8* @t1(i8* %base, i32* nocapture %offset, i32 %size) nounwind {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: subs
+; CHECK-NOT: cmp
+; CHECK-NOT: sub
+; CHECK: b.ge
+; CHECK: sub
+; CHECK: sub
+; CHECK-NOT: sub
+; CHECK: ret
+ %0 = load i32* %offset, align 4
+ %cmp = icmp slt i32 %0, %size
+ %s = sub nsw i32 %0, %size
+ br i1 %cmp, label %return, label %if.end
+
+if.end:
+ %sub = sub nsw i32 %0, %size
+ %s2 = sub nsw i32 %s, %size
+ %s3 = sub nsw i32 %sub, %s2
+ store i32 %s3, i32* %offset, align 4
+ %add.ptr = getelementptr inbounds i8* %base, i32 %sub
+ br label %return
+
+return:
+ %retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ]
+ ret i8* %retval.0
+}
+
+; CSE between "icmp reg imm" and "sub reg imm".
+define i8* @t2(i8* %base, i32* nocapture %offset) nounwind {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: subs
+; CHECK-NOT: cmp
+; CHECK-NOT: sub
+; CHECK: b.lt
+; CHECK-NOT: sub
+; CHECK: ret
+ %0 = load i32* %offset, align 4
+ %cmp = icmp slt i32 %0, 1
+ br i1 %cmp, label %return, label %if.end
+
+if.end:
+ %sub = sub nsw i32 %0, 1
+ store i32 %sub, i32* %offset, align 4
+ %add.ptr = getelementptr inbounds i8* %base, i32 %sub
+ br label %return
+
+return:
+ %retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ]
+ ret i8* %retval.0
+}
diff --git a/test/CodeGen/AArch64/arm64-csel.ll b/test/CodeGen/AArch64/arm64-csel.ll
new file mode 100644
index 0000000..98eba30
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-csel.ll
@@ -0,0 +1,230 @@
+; RUN: llc -O3 < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64"
+target triple = "arm64-unknown-unknown"
+
+; CHECK-LABEL: foo1
+; CHECK: cinc w{{[0-9]+}}, w{{[0-9]+}}, ne
+define i32 @foo1(i32 %b, i32 %c) nounwind readnone ssp {
+entry:
+ %not.tobool = icmp ne i32 %c, 0
+ %add = zext i1 %not.tobool to i32
+ %b.add = add i32 %c, %b
+ %add1 = add i32 %b.add, %add
+ ret i32 %add1
+}
+
+; CHECK-LABEL: foo2
+; CHECK: cneg w{{[0-9]+}}, w{{[0-9]+}}, ne
+define i32 @foo2(i32 %b, i32 %c) nounwind readnone ssp {
+entry:
+ %mul = sub i32 0, %b
+ %tobool = icmp eq i32 %c, 0
+ %b.mul = select i1 %tobool, i32 %b, i32 %mul
+ %add = add nsw i32 %b.mul, %c
+ ret i32 %add
+}
+
+; CHECK-LABEL: foo3
+; CHECK: cinv w{{[0-9]+}}, w{{[0-9]+}}, ne
+define i32 @foo3(i32 %b, i32 %c) nounwind readnone ssp {
+entry:
+ %not.tobool = icmp ne i32 %c, 0
+ %xor = sext i1 %not.tobool to i32
+ %b.xor = xor i32 %xor, %b
+ %add = add nsw i32 %b.xor, %c
+ ret i32 %add
+}
+
+; rdar://11632325
+define i32 @foo4(i32 %a) nounwind ssp {
+; CHECK-LABEL: foo4
+; CHECK: cneg
+; CHECK-NEXT: ret
+ %cmp = icmp sgt i32 %a, -1
+ %neg = sub nsw i32 0, %a
+ %cond = select i1 %cmp, i32 %a, i32 %neg
+ ret i32 %cond
+}
+
+define i32 @foo5(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK-LABEL: foo5
+; CHECK: subs
+; CHECK-NEXT: cneg
+; CHECK-NEXT: ret
+ %sub = sub nsw i32 %a, %b
+ %cmp = icmp sgt i32 %sub, -1
+ %sub3 = sub nsw i32 0, %sub
+ %cond = select i1 %cmp, i32 %sub, i32 %sub3
+ ret i32 %cond
+}
+
+; Make sure we can handle a branch instruction in optimizeCompare.
+define i32 @foo6(i32 %a, i32 %b) nounwind ssp {
+; CHECK-LABEL: foo6
+; CHECK: b
+ %sub = sub nsw i32 %a, %b
+ %cmp = icmp sgt i32 %sub, 0
+ br i1 %cmp, label %l.if, label %l.else
+
+l.if:
+ ret i32 1
+
+l.else:
+ ret i32 %sub
+}
+
+; If CPSR is used multiple times and the V flag is needed, we don't remove the cmp.
+define i32 @foo7(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK-LABEL: foo7:
+; CHECK: sub
+; CHECK-NEXT: adds
+; CHECK-NEXT: csneg
+; CHECK-NEXT: b
+ %sub = sub nsw i32 %a, %b
+ %cmp = icmp sgt i32 %sub, -1
+ %sub3 = sub nsw i32 0, %sub
+ %cond = select i1 %cmp, i32 %sub, i32 %sub3
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %cmp2 = icmp slt i32 %sub, -1
+ %sel = select i1 %cmp2, i32 %cond, i32 %a
+ ret i32 %sel
+
+if.else:
+ ret i32 %cond
+}
+
+define i32 @foo8(i32 %v, i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: foo8:
+; CHECK: cmp w0, #0
+; CHECK: csinv w0, w1, w2, ne
+ %tobool = icmp eq i32 %v, 0
+ %neg = xor i32 -1, %b
+ %cond = select i1 %tobool, i32 %neg, i32 %a
+ ret i32 %cond
+}
+
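+; -5 is the bitwise inverse of 4, so the expected lowering materializes 4 and
+; inverts it (cinv) when %v is zero.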
+define i32 @foo9(i32 %v) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo9:
+; CHECK: cmp w0, #0
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4
+; CHECK: cinv w0, w[[REG]], eq
+ %tobool = icmp ne i32 %v, 0
+ %cond = select i1 %tobool, i32 4, i32 -5
+ ret i32 %cond
+}
+
+define i64 @foo10(i64 %v) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo10:
+; CHECK: cmp x0, #0
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4
+; CHECK: cinv x0, x[[REG]], eq
+ %tobool = icmp ne i64 %v, 0
+ %cond = select i1 %tobool, i64 4, i64 -5
+ ret i64 %cond
+}
+
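+; -4 is the negation of 4, so the expected lowering materializes 4 and negates
+; it (cneg) when %v is zero.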
+define i32 @foo11(i32 %v) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo11:
+; CHECK: cmp w0, #0
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4
+; CHECK: cneg w0, w[[REG]], eq
+ %tobool = icmp ne i32 %v, 0
+ %cond = select i1 %tobool, i32 4, i32 -4
+ ret i32 %cond
+}
+
+define i64 @foo12(i64 %v) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo12:
+; CHECK: cmp x0, #0
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4
+; CHECK: cneg x0, x[[REG]], eq
+ %tobool = icmp ne i64 %v, 0
+ %cond = select i1 %tobool, i64 4, i64 -4
+ ret i64 %cond
+}
+
+define i32 @foo13(i32 %v, i32 %a, i32 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo13:
+; CHECK: cmp w0, #0
+; CHECK: csneg w0, w1, w2, ne
+ %tobool = icmp eq i32 %v, 0
+ %sub = sub i32 0, %b
+ %cond = select i1 %tobool, i32 %sub, i32 %a
+ ret i32 %cond
+}
+
+define i64 @foo14(i64 %v, i64 %a, i64 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo14:
+; CHECK: cmp x0, #0
+; CHECK: csneg x0, x1, x2, ne
+ %tobool = icmp eq i64 %v, 0
+ %sub = sub i64 0, %b
+ %cond = select i1 %tobool, i64 %sub, i64 %a
+ ret i64 %cond
+}
+
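+; Selecting between the constants 2 and 1 should materialize 1 and
+; conditionally increment it; foo16 checks the inverted condition.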
+define i32 @foo15(i32 %a, i32 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo15:
+; CHECK: cmp w0, w1
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1
+; CHECK: cinc w0, w[[REG]], gt
+ %cmp = icmp sgt i32 %a, %b
+ %. = select i1 %cmp, i32 2, i32 1
+ ret i32 %.
+}
+
+define i32 @foo16(i32 %a, i32 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo16:
+; CHECK: cmp w0, w1
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1
+; CHECK: cinc w0, w[[REG]], le
+ %cmp = icmp sgt i32 %a, %b
+ %. = select i1 %cmp, i32 1, i32 2
+ ret i32 %.
+}
+
+define i64 @foo17(i64 %a, i64 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo17:
+; CHECK: cmp x0, x1
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1
+; CHECK: cinc x0, x[[REG]], gt
+ %cmp = icmp sgt i64 %a, %b
+ %. = select i1 %cmp, i64 2, i64 1
+ ret i64 %.
+}
+
+define i64 @foo18(i64 %a, i64 %b) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: foo18:
+; CHECK: cmp x0, x1
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1
+; CHECK: cinc x0, x[[REG]], le
+ %cmp = icmp sgt i64 %a, %b
+ %. = select i1 %cmp, i64 1, i64 2
+ ret i64 %.
+}
+
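+; The add of the zero-extended compare result should fold into a single cinc
+; of %c, with no separate add instruction.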
+define i64 @foo19(i64 %a, i64 %b, i64 %c) {
+entry:
+; CHECK-LABEL: foo19:
+; CHECK: cinc x0, x2
+; CHECK-NOT: add
+ %cmp = icmp ult i64 %a, %b
+ %inc = zext i1 %cmp to i64
+ %inc.c = add i64 %inc, %c
+ ret i64 %inc.c
+}
diff --git a/test/CodeGen/AArch64/arm64-cvt.ll b/test/CodeGen/AArch64/arm64-cvt.ll
new file mode 100644
index 0000000..420a8bc
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-cvt.ll
@@ -0,0 +1,401 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+;
+; Floating-point scalar convert to signed integer (to nearest with ties to away)
+;
+define i32 @fcvtas_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtas_1w1s:
+;CHECK: fcvtas w0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtas_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtas_1x1s:
+;CHECK: fcvtas x0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtas.i64.f32(float %A)
+ ret i64 %tmp3
+}
+
+define i32 @fcvtas_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtas_1w1d:
+;CHECK: fcvtas w0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtas.i32.f64(double %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtas_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtas_1x1d:
+;CHECK: fcvtas x0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double %A)
+ ret i64 %tmp3
+}
+
+declare i32 @llvm.aarch64.neon.fcvtas.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtas.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtas.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtas.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to unsigned integer
+;
+define i32 @fcvtau_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtau_1w1s:
+;CHECK: fcvtau w0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtau_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtau_1x1s:
+;CHECK: fcvtau x0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtau.i64.f32(float %A)
+ ret i64 %tmp3
+}
+
+define i32 @fcvtau_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtau_1w1d:
+;CHECK: fcvtau w0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtau.i32.f64(double %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtau_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtau_1x1d:
+;CHECK: fcvtau x0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double %A)
+ ret i64 %tmp3
+}
+
+declare i32 @llvm.aarch64.neon.fcvtau.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtau.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtau.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtau.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to signed integer (toward -Inf)
+;
+define i32 @fcvtms_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtms_1w1s:
+;CHECK: fcvtms w0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtms_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtms_1x1s:
+;CHECK: fcvtms x0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtms.i64.f32(float %A)
+ ret i64 %tmp3
+}
+
+define i32 @fcvtms_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtms_1w1d:
+;CHECK: fcvtms w0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtms.i32.f64(double %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtms_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtms_1x1d:
+;CHECK: fcvtms x0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double %A)
+ ret i64 %tmp3
+}
+
+declare i32 @llvm.aarch64.neon.fcvtms.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtms.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtms.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtms.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to unsigned integer (toward -Inf)
+;
+define i32 @fcvtmu_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtmu_1w1s:
+;CHECK: fcvtmu w0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtmu_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtmu_1x1s:
+;CHECK: fcvtmu x0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtmu.i64.f32(float %A)
+ ret i64 %tmp3
+}
+
+define i32 @fcvtmu_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtmu_1w1d:
+;CHECK: fcvtmu w0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtmu.i32.f64(double %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtmu_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtmu_1x1d:
+;CHECK: fcvtmu x0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double %A)
+ ret i64 %tmp3
+}
+
+declare i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtmu.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtmu.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to signed integer (to nearest with ties to even)
+;
+define i32 @fcvtns_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtns_1w1s:
+;CHECK: fcvtns w0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtns_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtns_1x1s:
+;CHECK: fcvtns x0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtns.i64.f32(float %A)
+ ret i64 %tmp3
+}
+
+define i32 @fcvtns_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtns_1w1d:
+;CHECK: fcvtns w0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtns.i32.f64(double %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtns_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtns_1x1d:
+;CHECK: fcvtns x0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double %A)
+ ret i64 %tmp3
+}
+
+declare i32 @llvm.aarch64.neon.fcvtns.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtns.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtns.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtns.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to unsigned integer (to nearest with ties to even)
+;
+define i32 @fcvtnu_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtnu_1w1s:
+;CHECK: fcvtnu w0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtnu_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtnu_1x1s:
+;CHECK: fcvtnu x0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtnu.i64.f32(float %A)
+ ret i64 %tmp3
+}
+
+define i32 @fcvtnu_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtnu_1w1d:
+;CHECK: fcvtnu w0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtnu.i32.f64(double %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtnu_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtnu_1x1d:
+;CHECK: fcvtnu x0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double %A)
+ ret i64 %tmp3
+}
+
+declare i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtnu.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtnu.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to signed integer (toward +Inf)
+;
+define i32 @fcvtps_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtps_1w1s:
+;CHECK: fcvtps w0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtps_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtps_1x1s:
+;CHECK: fcvtps x0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtps.i64.f32(float %A)
+ ret i64 %tmp3
+}
+
+define i32 @fcvtps_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtps_1w1d:
+;CHECK: fcvtps w0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtps.i32.f64(double %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtps_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtps_1x1d:
+;CHECK: fcvtps x0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double %A)
+ ret i64 %tmp3
+}
+
+declare i32 @llvm.aarch64.neon.fcvtps.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtps.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtps.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtps.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to unsigned integer (toward +Inf)
+;
+define i32 @fcvtpu_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtpu_1w1s:
+;CHECK: fcvtpu w0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtpu_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtpu_1x1s:
+;CHECK: fcvtpu x0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtpu.i64.f32(float %A)
+ ret i64 %tmp3
+}
+
+define i32 @fcvtpu_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtpu_1w1d:
+;CHECK: fcvtpu w0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtpu.i32.f64(double %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtpu_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtpu_1x1d:
+;CHECK: fcvtpu x0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double %A)
+ ret i64 %tmp3
+}
+
+declare i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtpu.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtpu.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to signed integer (toward zero)
+;
+define i32 @fcvtzs_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtzs_1w1s:
+;CHECK: fcvtzs w0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtzs_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtzs_1x1s:
+;CHECK: fcvtzs x0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float %A)
+ ret i64 %tmp3
+}
+
+define i32 @fcvtzs_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtzs_1w1d:
+;CHECK: fcvtzs w0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtzs.i32.f64(double %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtzs_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtzs_1x1d:
+;CHECK: fcvtzs x0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double %A)
+ ret i64 %tmp3
+}
+
+declare i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtzs.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to unsigned integer (toward zero)
+;
+define i32 @fcvtzu_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtzu_1w1s:
+;CHECK: fcvtzu w0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtzu_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtzu_1x1s:
+;CHECK: fcvtzu x0, s0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtzu.i64.f32(float %A)
+ ret i64 %tmp3
+}
+
+define i32 @fcvtzu_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtzu_1w1d:
+;CHECK: fcvtzu w0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i32 @llvm.aarch64.neon.fcvtzu.i32.f64(double %A)
+ ret i32 %tmp3
+}
+
+define i64 @fcvtzu_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtzu_1x1d:
+;CHECK: fcvtzu x0, d0
+;CHECK-NEXT: ret
+ %tmp3 = call i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double %A)
+ ret i64 %tmp3
+}
+
+declare i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtzu.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtzu.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-convergence.ll b/test/CodeGen/AArch64/arm64-dagcombiner-convergence.ll
new file mode 100644
index 0000000..a45e313
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-dagcombiner-convergence.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -o /dev/null
+; rdar://10795250
+; DAGCombiner should converge.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64"
+target triple = "arm64-apple-macosx10.8.0"
+
+define i64 @foo(i128 %Params.coerce, i128 %SelLocs.coerce) {
+entry:
+ %tmp = lshr i128 %Params.coerce, 61
+ %.tr38.i = trunc i128 %tmp to i64
+ %mul.i = and i64 %.tr38.i, 4294967288
+ %tmp1 = lshr i128 %SelLocs.coerce, 62
+ %.tr.i = trunc i128 %tmp1 to i64
+ %mul7.i = and i64 %.tr.i, 4294967292
+ %add.i = add i64 %mul7.i, %mul.i
+ %conv.i.i = and i64 %add.i, 4294967292
+ ret i64 %conv.i.i
+}
diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
new file mode 100644
index 0000000..2cf0135
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
@@ -0,0 +1,29 @@
+; RUN: llc -mcpu=cyclone < %s | FileCheck %s
+
+target datalayout = "e-i64:64-n32:64-S128"
+target triple = "arm64-apple-ios"
+
+%"struct.SU" = type { i32, %"struct.SU"*, i32*, i32, i32, %"struct.BO", i32, [5 x i8] }
+%"struct.BO" = type { %"struct.RE" }
+
+%"struct.RE" = type { i32, i32, i32, i32 }
+
+; This is a read-modify-write of some bit-fields combined into an i48. It gets
+; legalized into i32 and i16 accesses. Only a single store of zero to the low
+; i32 part should be live.
+
+; CHECK-LABEL: test:
+; CHECK-NOT: ldr
+; CHECK: str wzr
+; CHECK-NOT: str
+define void @test(%"struct.SU"* nocapture %su) {
+entry:
+ %r1 = getelementptr inbounds %"struct.SU"* %su, i64 1, i32 5
+ %r2 = bitcast %"struct.BO"* %r1 to i48*
+ %r3 = load i48* %r2, align 8
+ %r4 = and i48 %r3, -4294967296
+ %r5 = or i48 0, %r4
+ store i48 %r5, i48* %r2, align 8
+
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
new file mode 100644
index 0000000..2e4b658
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
@@ -0,0 +1,46 @@
+; RUN: llc -O3 < %s | FileCheck %s
+; RUN: llc -O3 -addr-sink-using-gep=1 < %s | FileCheck %s
+; Test case for a DAG combiner bug where we combined an indexed load
+; with an extension (sext, zext, or anyext) into a regular extended load,
+; i.e., dropping the indexed value.
+; <rdar://problem/16389332>
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios"
+
+%class.A = type { i64, i64 }
+%class.C = type { i64 }
+
+; CHECK-LABEL: XX:
+; CHECK: ldr
+define void @XX(%class.A* %K) {
+entry:
+ br i1 undef, label %if.then, label %lor.rhs.i
+
+lor.rhs.i: ; preds = %entry
+ %tmp = load i32* undef, align 4
+ %y.i.i.i = getelementptr inbounds %class.A* %K, i64 0, i32 1
+ %tmp1 = load i64* %y.i.i.i, align 8
+ %U.sroa.3.8.extract.trunc.i = trunc i64 %tmp1 to i32
+ %div11.i = sdiv i32 %U.sroa.3.8.extract.trunc.i, 17
+ %add12.i = add nsw i32 0, %div11.i
+ %U.sroa.3.12.extract.shift.i = lshr i64 %tmp1, 32
+ %U.sroa.3.12.extract.trunc.i = trunc i64 %U.sroa.3.12.extract.shift.i to i32
+ %div15.i = sdiv i32 %U.sroa.3.12.extract.trunc.i, 13
+ %add16.i = add nsw i32 %add12.i, %div15.i
+ %rem.i.i = srem i32 %add16.i, %tmp
+ %idxprom = sext i32 %rem.i.i to i64
+ %arrayidx = getelementptr inbounds %class.C** undef, i64 %idxprom
+ %tobool533 = icmp eq %class.C* undef, null
+ br i1 %tobool533, label %while.end, label %while.body
+
+if.then: ; preds = %entry
+ unreachable
+
+while.body: ; preds = %lor.rhs.i
+ unreachable
+
+while.end: ; preds = %lor.rhs.i
+ %tmp3 = load %class.C** %arrayidx, align 8
+ unreachable
+}
diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll b/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll
new file mode 100644
index 0000000..0679014
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll
@@ -0,0 +1,102 @@
+; RUN: llc -mtriple arm64-apple-ios -O3 -o - < %s | FileCheck %s
+; <rdar://problem/14477220>
+
+%class.Complex = type { float, float }
+%class.Complex_int = type { i32, i32 }
+%class.Complex_long = type { i64, i64 }
+
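+; The wide i64/i128 loads below should be sliced into two narrower loads
+; (emitted as an ldp) rather than a single load followed by shifts and truncs.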
+; CHECK-LABEL: @test
+; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #3
+; CHECK: ldp [[CPLX1_I:s[0-9]+]], [[CPLX1_R:s[0-9]+]], {{\[}}[[BASE]]]
+; CHECK: ldp [[CPLX2_I:s[0-9]+]], [[CPLX2_R:s[0-9]+]], {{\[}}[[BASE]], #64]
+; CHECK: fadd {{s[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]]
+; CHECK: fadd {{s[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]]
+; CHECK: ret
+define void @test(%class.Complex* nocapture %out, i64 %out_start) {
+entry:
+ %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
+ %0 = bitcast %class.Complex* %arrayidx to i64*
+ %1 = load i64* %0, align 4
+ %t0.sroa.0.0.extract.trunc = trunc i64 %1 to i32
+ %2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float
+ %t0.sroa.2.0.extract.shift = lshr i64 %1, 32
+ %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32
+ %3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float
+ %add = add i64 %out_start, 8
+ %arrayidx2 = getelementptr inbounds %class.Complex* %out, i64 %add
+ %i.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 0
+ %4 = load float* %i.i, align 4
+ %add.i = fadd float %4, %2
+ %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0
+ %r.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 1
+ %5 = load float* %r.i, align 4
+ %add5.i = fadd float %5, %3
+ %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1
+ %ref.tmp.sroa.0.0.cast = bitcast %class.Complex* %arrayidx to <2 x float>*
+ store <2 x float> %retval.sroa.0.4.vec.insert.i, <2 x float>* %ref.tmp.sroa.0.0.cast, align 4
+ ret void
+}
+
+; CHECK-LABEL: @test_int
+; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #3
+; CHECK: ldp [[CPLX1_I:w[0-9]+]], [[CPLX1_R:w[0-9]+]], {{\[}}[[BASE]]]
+; CHECK: ldp [[CPLX2_I:w[0-9]+]], [[CPLX2_R:w[0-9]+]], {{\[}}[[BASE]], #64]
+; CHECK: add {{w[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]]
+; CHECK: add {{w[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]]
+; CHECK: ret
+define void @test_int(%class.Complex_int* nocapture %out, i64 %out_start) {
+entry:
+ %arrayidx = getelementptr inbounds %class.Complex_int* %out, i64 %out_start
+ %0 = bitcast %class.Complex_int* %arrayidx to i64*
+ %1 = load i64* %0, align 4
+ %t0.sroa.0.0.extract.trunc = trunc i64 %1 to i32
+ %2 = bitcast i32 %t0.sroa.0.0.extract.trunc to i32
+ %t0.sroa.2.0.extract.shift = lshr i64 %1, 32
+ %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32
+ %3 = bitcast i32 %t0.sroa.2.0.extract.trunc to i32
+ %add = add i64 %out_start, 8
+ %arrayidx2 = getelementptr inbounds %class.Complex_int* %out, i64 %add
+ %i.i = getelementptr inbounds %class.Complex_int* %arrayidx2, i64 0, i32 0
+ %4 = load i32* %i.i, align 4
+ %add.i = add i32 %4, %2
+ %retval.sroa.0.0.vec.insert.i = insertelement <2 x i32> undef, i32 %add.i, i32 0
+ %r.i = getelementptr inbounds %class.Complex_int* %arrayidx2, i64 0, i32 1
+ %5 = load i32* %r.i, align 4
+ %add5.i = add i32 %5, %3
+ %retval.sroa.0.4.vec.insert.i = insertelement <2 x i32> %retval.sroa.0.0.vec.insert.i, i32 %add5.i, i32 1
+ %ref.tmp.sroa.0.0.cast = bitcast %class.Complex_int* %arrayidx to <2 x i32>*
+ store <2 x i32> %retval.sroa.0.4.vec.insert.i, <2 x i32>* %ref.tmp.sroa.0.0.cast, align 4
+ ret void
+}
+
+; CHECK-LABEL: @test_long
+; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #4
+; CHECK: ldp [[CPLX1_I:x[0-9]+]], [[CPLX1_R:x[0-9]+]], {{\[}}[[BASE]]]
+; CHECK: ldp [[CPLX2_I:x[0-9]+]], [[CPLX2_R:x[0-9]+]], {{\[}}[[BASE]], #128]
+; CHECK: add {{x[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]]
+; CHECK: add {{x[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]]
+; CHECK: ret
+define void @test_long(%class.Complex_long* nocapture %out, i64 %out_start) {
+entry:
+ %arrayidx = getelementptr inbounds %class.Complex_long* %out, i64 %out_start
+ %0 = bitcast %class.Complex_long* %arrayidx to i128*
+ %1 = load i128* %0, align 4
+ %t0.sroa.0.0.extract.trunc = trunc i128 %1 to i64
+ %2 = bitcast i64 %t0.sroa.0.0.extract.trunc to i64
+ %t0.sroa.2.0.extract.shift = lshr i128 %1, 64
+ %t0.sroa.2.0.extract.trunc = trunc i128 %t0.sroa.2.0.extract.shift to i64
+ %3 = bitcast i64 %t0.sroa.2.0.extract.trunc to i64
+ %add = add i64 %out_start, 8
+ %arrayidx2 = getelementptr inbounds %class.Complex_long* %out, i64 %add
+ %i.i = getelementptr inbounds %class.Complex_long* %arrayidx2, i32 0, i32 0
+ %4 = load i64* %i.i, align 4
+ %add.i = add i64 %4, %2
+ %retval.sroa.0.0.vec.insert.i = insertelement <2 x i64> undef, i64 %add.i, i32 0
+ %r.i = getelementptr inbounds %class.Complex_long* %arrayidx2, i32 0, i32 1
+ %5 = load i64* %r.i, align 4
+ %add5.i = add i64 %5, %3
+ %retval.sroa.0.4.vec.insert.i = insertelement <2 x i64> %retval.sroa.0.0.vec.insert.i, i64 %add5.i, i32 1
+ %ref.tmp.sroa.0.0.cast = bitcast %class.Complex_long* %arrayidx to <2 x i64>*
+ store <2 x i64> %retval.sroa.0.4.vec.insert.i, <2 x i64>* %ref.tmp.sroa.0.0.cast, align 4
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-dead-def-frame-index.ll b/test/CodeGen/AArch64/arm64-dead-def-frame-index.ll
new file mode 100644
index 0000000..9bb4b71
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-dead-def-frame-index.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=arm64 < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios7.0.0"
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @test1() #0 {
+ %tmp1 = alloca i8
+ %tmp2 = alloca i32, i32 4096
+ %tmp3 = icmp eq i8* %tmp1, null
+ %tmp4 = zext i1 %tmp3 to i32
+
+ ret i32 %tmp4
+
+ ; CHECK-LABEL: test1
+ ; CHECK: adds [[TEMP:[a-z0-9]+]], sp, #4, lsl #12
+ ; CHECK: adds [[TEMP]], [[TEMP]], #15
+}
diff --git a/test/CodeGen/AArch64/arm64-dead-register-def-bug.ll b/test/CodeGen/AArch64/arm64-dead-register-def-bug.ll
new file mode 100644
index 0000000..1bbcf50
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-dead-register-def-bug.ll
@@ -0,0 +1,32 @@
+; RUN: llc -mtriple="arm64-apple-ios" < %s | FileCheck %s
+;
+; Check that the dead register definition pass takes implicit defs into account.
+; When rematerializing through truncates, the coalescer may produce instructions
+; with dead defs, but live implicit-defs of subregs:
+; E.g. %X1<def, dead> = MOVi64imm 2, %W1<imp-def>; %X1:GPR64, %W1:GPR32
+; These instructions are live, and their definitions should not be rewritten.
+;
+; <rdar://problem/16492408>
+
+define void @testcase() {
+; CHECK: testcase:
+; CHECK-NOT: orr xzr, xzr, #0x2
+
+bb1:
+ %tmp1 = tail call float @ceilf(float 2.000000e+00)
+ %tmp2 = fptoui float %tmp1 to i64
+ br i1 undef, label %bb2, label %bb3
+
+bb2:
+ tail call void @foo()
+ br label %bb3
+
+bb3:
+ %tmp3 = trunc i64 %tmp2 to i32
+ tail call void @bar(i32 %tmp3)
+ ret void
+}
+
+declare void @foo()
+declare void @bar(i32)
+declare float @ceilf(float) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-dup.ll b/test/CodeGen/AArch64/arm64-dup.ll
new file mode 100644
index 0000000..0c56b46
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-dup.ll
@@ -0,0 +1,323 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+
+define <8 x i8> @v_dup8(i8 %A) nounwind {
+;CHECK-LABEL: v_dup8:
+;CHECK: dup.8b
+ %tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
+ %tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
+ %tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
+ %tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
+ %tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
+ %tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
+ %tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
+ %tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
+ ret <8 x i8> %tmp8
+}
+
+define <4 x i16> @v_dup16(i16 %A) nounwind {
+;CHECK-LABEL: v_dup16:
+;CHECK: dup.4h
+ %tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
+ %tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
+ %tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
+ %tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @v_dup32(i32 %A) nounwind {
+;CHECK-LABEL: v_dup32:
+;CHECK: dup.2s
+ %tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
+ %tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
+ ret <2 x i32> %tmp2
+}
+
+define <2 x float> @v_dupfloat(float %A) nounwind {
+;CHECK-LABEL: v_dupfloat:
+;CHECK: dup.2s
+ %tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
+ %tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
+ ret <2 x float> %tmp2
+}
+
+define <16 x i8> @v_dupQ8(i8 %A) nounwind {
+;CHECK-LABEL: v_dupQ8:
+;CHECK: dup.16b
+ %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
+ %tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
+ %tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
+ %tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
+ %tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
+ %tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
+ %tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
+ %tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
+ %tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
+ %tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
+ %tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
+ %tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
+ %tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
+ %tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
+ %tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
+ %tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
+ ret <16 x i8> %tmp16
+}
+
+define <8 x i16> @v_dupQ16(i16 %A) nounwind {
+;CHECK-LABEL: v_dupQ16:
+;CHECK: dup.8h
+ %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
+ %tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
+ %tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
+ %tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
+ %tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
+ %tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
+ %tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
+ %tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
+ ret <8 x i16> %tmp8
+}
+
+define <4 x i32> @v_dupQ32(i32 %A) nounwind {
+;CHECK-LABEL: v_dupQ32:
+;CHECK: dup.4s
+ %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
+ %tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
+ %tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
+ %tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
+ ret <4 x i32> %tmp4
+}
+
+define <4 x float> @v_dupQfloat(float %A) nounwind {
+;CHECK-LABEL: v_dupQfloat:
+;CHECK: dup.4s
+ %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
+ %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
+ %tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
+ %tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
+ ret <4 x float> %tmp4
+}
+
+; Check to make sure it works with shuffles, too.
+
+define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
+;CHECK-LABEL: v_shuffledup8:
+;CHECK: dup.8b
+ %tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
+;CHECK-LABEL: v_shuffledup16:
+;CHECK: dup.4h
+ %tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
+;CHECK-LABEL: v_shuffledup32:
+;CHECK: dup.2s
+ %tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
+ %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
+ ret <2 x i32> %tmp2
+}
+
+define <2 x float> @v_shuffledupfloat(float %A) nounwind {
+;CHECK-LABEL: v_shuffledupfloat:
+;CHECK: dup.2s
+ %tmp1 = insertelement <2 x float> undef, float %A, i32 0
+ %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
+ ret <2 x float> %tmp2
+}
+
+define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
+;CHECK-LABEL: v_shuffledupQ8:
+;CHECK: dup.16b
+ %tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
+ %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
+;CHECK-LABEL: v_shuffledupQ16:
+;CHECK: dup.8h
+ %tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
+ %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
+;CHECK-LABEL: v_shuffledupQ32:
+;CHECK: dup.4s
+ %tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
+ %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %tmp2
+}
+
+define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
+;CHECK-LABEL: v_shuffledupQfloat:
+;CHECK: dup.4s
+ %tmp1 = insertelement <4 x float> undef, float %A, i32 0
+ %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %tmp2
+}
+
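+; A shuffle that splats a single lane of a loaded vector (all indices equal)
+; should also become a lane-indexed dup.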
+define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: vduplane8:
+;CHECK: dup.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: vduplane16:
+;CHECK: dup.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: vduplane32:
+;CHECK: dup.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
+ ret <2 x i32> %tmp2
+}
+
+define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
+;CHECK-LABEL: vduplanefloat:
+;CHECK: dup.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
+ ret <2 x float> %tmp2
+}
+
+define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: vduplaneQ8:
+;CHECK: dup.16b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: vduplaneQ16:
+;CHECK: dup.8h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: vduplaneQ32:
+;CHECK: dup.4s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
+ ret <4 x i32> %tmp2
+}
+
+define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
+;CHECK-LABEL: vduplaneQfloat:
+;CHECK: dup.4s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
+ ret <4 x float> %tmp2
+}
+
+define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
+;CHECK-LABEL: foo:
+;CHECK: dup.2d
+entry:
+ %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+ ret <2 x i64> %0
+}
+
+define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
+;CHECK-LABEL: bar:
+;CHECK: dup.2d
+entry:
+ %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+ ret <2 x i64> %0
+}
+
+define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
+;CHECK-LABEL: baz:
+;CHECK: dup.2d
+entry:
+ %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+ ret <2 x double> %0
+}
+
+define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
+;CHECK-LABEL: qux:
+;CHECK: dup.2d
+entry:
+ %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ ret <2 x double> %0
+}
+
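+; Non-splat build vectors cannot use dup; they are expected to lower to an
+; fmov for lane 0 followed by an ins for each remaining lane.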
+define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: f:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ins.s v0[1], w1
+; CHECK-NEXT: ret
+ %vecinit = insertelement <2 x i32> undef, i32 %a, i32 0
+ %vecinit1 = insertelement <2 x i32> %vecinit, i32 %b, i32 1
+ ret <2 x i32> %vecinit1
+}
+
+define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: g:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ins.s v0[1], w1
+; CHECK-NEXT: ins.s v0[2], w1
+; CHECK-NEXT: ins.s v0[3], w0
+; CHECK-NEXT: ret
+ %vecinit = insertelement <4 x i32> undef, i32 %a, i32 0
+ %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2
+ %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a, i32 3
+ ret <4 x i32> %vecinit3
+}
+
+define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone {
+; CHECK-LABEL: h:
+; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: ins.d v0[1], x1
+; CHECK-NEXT: ret
+ %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0
+ %vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1
+ ret <2 x i64> %vecinit1
+}
+
+; We used to spot this as a BUILD_VECTOR implementable by dup, but assumed that
+; the single value needed was of the same type as the vector. This is false if
+; the scalar corresponding to the vector type is illegal (e.g. a <4 x i16>
+; BUILD_VECTOR will have an i32 as its source). In that case, the operation is
+; not a simple "dup vD.4h, vN.h[idx]" after all, and we crashed.
+;
+; *However*, it is a dup vD.4h, vN.h[2*idx].
+define <4 x i16> @test_build_illegal(<4 x i32> %in) {
+; CHECK-LABEL: test_build_illegal:
+; CHECK: dup.4h v0, v0[6]
+ %val = extractelement <4 x i32> %in, i32 3
+ %smallval = trunc i32 %val to i16
+  %vec = insertelement <4 x i16> undef, i16 %smallval, i32 3
+
+ ret <4 x i16> %vec
+}
+
+; We used to inherit an already extract_subvectored v4i16 from
+; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing
+; the formation of an indexed-by-7 MLS.
+define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+; CHECK-LABEL: test_high_splat:
+; CHECK: mls.4h v0, v1, v2[7]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %mul = mul <4 x i16> %shuffle, %b
+ %sub = sub <4 x i16> %a, %mul
+ ret <4 x i16> %sub
+}
diff --git a/test/CodeGen/AArch64/arm64-early-ifcvt.ll b/test/CodeGen/AArch64/arm64-early-ifcvt.ll
new file mode 100644
index 0000000..17d783a
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-early-ifcvt.ll
@@ -0,0 +1,423 @@
+; RUN: llc < %s -stress-early-ifcvt | FileCheck %s
+target triple = "arm64-apple-macosx"
+
+; CHECK: mm2
+define i32 @mm2(i32* nocapture %p, i32 %n) nounwind uwtable readonly ssp {
+entry:
+ br label %do.body
+
+; CHECK: do.body
+; Loop body has no branches before the backedge.
+; CHECK-NOT: LBB
+do.body:
+ %max.0 = phi i32 [ 0, %entry ], [ %max.1, %do.cond ]
+ %min.0 = phi i32 [ 0, %entry ], [ %min.1, %do.cond ]
+ %n.addr.0 = phi i32 [ %n, %entry ], [ %dec, %do.cond ]
+ %p.addr.0 = phi i32* [ %p, %entry ], [ %incdec.ptr, %do.cond ]
+ %incdec.ptr = getelementptr inbounds i32* %p.addr.0, i64 1
+ %0 = load i32* %p.addr.0, align 4
+ %cmp = icmp sgt i32 %0, %max.0
+ br i1 %cmp, label %do.cond, label %if.else
+
+if.else:
+ %cmp1 = icmp slt i32 %0, %min.0
+ %.min.0 = select i1 %cmp1, i32 %0, i32 %min.0
+ br label %do.cond
+
+do.cond:
+ %max.1 = phi i32 [ %0, %do.body ], [ %max.0, %if.else ]
+ %min.1 = phi i32 [ %min.0, %do.body ], [ %.min.0, %if.else ]
+; CHECK: cbnz
+ %dec = add i32 %n.addr.0, -1
+ %tobool = icmp eq i32 %dec, 0
+ br i1 %tobool, label %do.end, label %do.body
+
+do.end:
+ %sub = sub nsw i32 %max.1, %min.1
+ ret i32 %sub
+}
+
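+; The fold_* tests below check that, after if-conversion, the speculated
+; add/xor/sub folds with the select into csinc/csinv/csneg.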
+; CHECK-LABEL: fold_inc_true_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csinc w0, w1, w0, eq
+; CHECK-NEXT: ret
+define i32 @fold_inc_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i32 %c, 1
+ %inc = add nsw i32 %x, 1
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i32 [ %y, %eq_bb ], [ %inc, %entry ]
+ ret i32 %cond
+}
+
+; CHECK-LABEL: fold_inc_true_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csinc x0, x1, x0, eq
+; CHECK-NEXT: ret
+define i64 @fold_inc_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i64 %c, 1
+ %inc = add nsw i64 %x, 1
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i64 [ %y, %eq_bb ], [ %inc, %entry ]
+ ret i64 %cond
+}
+
+; CHECK-LABEL: fold_inc_false_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csinc w0, w1, w0, ne
+; CHECK-NEXT: ret
+define i32 @fold_inc_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i32 %c, 1
+ %inc = add nsw i32 %x, 1
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i32 [ %inc, %eq_bb ], [ %y, %entry ]
+ ret i32 %cond
+}
+
+; CHECK-LABEL: fold_inc_false_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csinc x0, x1, x0, ne
+; CHECK-NEXT: ret
+define i64 @fold_inc_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i64 %c, 1
+ %inc = add nsw i64 %x, 1
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i64 [ %inc, %eq_bb ], [ %y, %entry ]
+ ret i64 %cond
+}
+
+; CHECK-LABEL: fold_inv_true_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csinv w0, w1, w0, eq
+; CHECK-NEXT: ret
+define i32 @fold_inv_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i32 %c, 1
+ %inv = xor i32 %x, -1
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i32 [ %y, %eq_bb ], [ %inv, %entry ]
+ ret i32 %cond
+}
+
+; CHECK-LABEL: fold_inv_true_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csinv x0, x1, x0, eq
+; CHECK-NEXT: ret
+define i64 @fold_inv_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i64 %c, 1
+ %inv = xor i64 %x, -1
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i64 [ %y, %eq_bb ], [ %inv, %entry ]
+ ret i64 %cond
+}
+
+; CHECK-LABEL: fold_inv_false_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csinv w0, w1, w0, ne
+; CHECK-NEXT: ret
+define i32 @fold_inv_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i32 %c, 1
+ %inv = xor i32 %x, -1
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i32 [ %inv, %eq_bb ], [ %y, %entry ]
+ ret i32 %cond
+}
+
+; CHECK-LABEL: fold_inv_false_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csinv x0, x1, x0, ne
+; CHECK-NEXT: ret
+define i64 @fold_inv_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i64 %c, 1
+ %inv = xor i64 %x, -1
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i64 [ %inv, %eq_bb ], [ %y, %entry ]
+ ret i64 %cond
+}
+
+; CHECK-LABEL: fold_neg_true_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csneg w0, w1, w0, eq
+; CHECK-NEXT: ret
+define i32 @fold_neg_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i32 %c, 1
+ %neg = sub nsw i32 0, %x
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i32 [ %y, %eq_bb ], [ %neg, %entry ]
+ ret i32 %cond
+}
+
+; CHECK-LABEL: fold_neg_true_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csneg x0, x1, x0, eq
+; CHECK-NEXT: ret
+define i64 @fold_neg_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i64 %c, 1
+ %neg = sub nsw i64 0, %x
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i64 [ %y, %eq_bb ], [ %neg, %entry ]
+ ret i64 %cond
+}
+
+; CHECK-LABEL: fold_neg_false_32:
+; CHECK: {{subs.*wzr,|cmp}} w2, #1
+; CHECK-NEXT: csneg w0, w1, w0, ne
+; CHECK-NEXT: ret
+define i32 @fold_neg_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i32 %c, 1
+ %neg = sub nsw i32 0, %x
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i32 [ %neg, %eq_bb ], [ %y, %entry ]
+ ret i32 %cond
+}
+
+; CHECK-LABEL: fold_neg_false_64:
+; CHECK: {{subs.*xzr,|cmp}} x2, #1
+; CHECK-NEXT: csneg x0, x1, x0, ne
+; CHECK-NEXT: ret
+define i64 @fold_neg_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i64 %c, 1
+ %neg = sub nsw i64 0, %x
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i64 [ %neg, %eq_bb ], [ %y, %entry ]
+ ret i64 %cond
+}
+
+; CHECK: cbnz_32
+; CHECK: {{subs.*wzr,|cmp}} w2, #0
+; CHECK-NEXT: csel w0, w1, w0, ne
+; CHECK-NEXT: ret
+define i32 @cbnz_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i32 %c, 0
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i32 [ %x, %eq_bb ], [ %y, %entry ]
+ ret i32 %cond
+}
+
+; CHECK: cbnz_64
+; CHECK: {{subs.*xzr,|cmp}} x2, #0
+; CHECK-NEXT: csel x0, x1, x0, ne
+; CHECK-NEXT: ret
+define i64 @cbnz_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+ %tobool = icmp eq i64 %c, 0
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i64 [ %x, %eq_bb ], [ %y, %entry ]
+ ret i64 %cond
+}
+
+; CHECK: cbz_32
+; CHECK: {{subs.*wzr,|cmp}} w2, #0
+; CHECK-NEXT: csel w0, w1, w0, eq
+; CHECK-NEXT: ret
+define i32 @cbz_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+ %tobool = icmp ne i32 %c, 0
+ br i1 %tobool, label %ne_bb, label %done
+
+ne_bb:
+ br label %done
+
+done:
+ %cond = phi i32 [ %x, %ne_bb ], [ %y, %entry ]
+ ret i32 %cond
+}
+
+; CHECK: cbz_64
+; CHECK: {{subs.*xzr,|cmp}} x2, #0
+; CHECK-NEXT: csel x0, x1, x0, eq
+; CHECK-NEXT: ret
+define i64 @cbz_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+ %tobool = icmp ne i64 %c, 0
+ br i1 %tobool, label %ne_bb, label %done
+
+ne_bb:
+ br label %done
+
+done:
+ %cond = phi i64 [ %x, %ne_bb ], [ %y, %entry ]
+ ret i64 %cond
+}
+
+; CHECK: tbnz_32
+; CHECK: {{ands.*xzr,|tst}} w2, #0x80
+; CHECK-NEXT: csel w0, w1, w0, ne
+; CHECK-NEXT: ret
+define i32 @tbnz_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+ %mask = and i32 %c, 128
+ %tobool = icmp eq i32 %mask, 0
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i32 [ %x, %eq_bb ], [ %y, %entry ]
+ ret i32 %cond
+}
+
+; CHECK: tbnz_64
+; CHECK: {{ands.*xzr,|tst}} x2, #0x8000000000000000
+; CHECK-NEXT: csel x0, x1, x0, ne
+; CHECK-NEXT: ret
+define i64 @tbnz_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+ %mask = and i64 %c, 9223372036854775808
+ %tobool = icmp eq i64 %mask, 0
+ br i1 %tobool, label %eq_bb, label %done
+
+eq_bb:
+ br label %done
+
+done:
+ %cond = phi i64 [ %x, %eq_bb ], [ %y, %entry ]
+ ret i64 %cond
+}
+
+; CHECK: tbz_32
+; CHECK: {{ands.*xzr,|tst}} w2, #0x80
+; CHECK-NEXT: csel w0, w1, w0, eq
+; CHECK-NEXT: ret
+define i32 @tbz_32(i32 %x, i32 %y, i32 %c) nounwind ssp {
+entry:
+ %mask = and i32 %c, 128
+ %tobool = icmp ne i32 %mask, 0
+ br i1 %tobool, label %ne_bb, label %done
+
+ne_bb:
+ br label %done
+
+done:
+ %cond = phi i32 [ %x, %ne_bb ], [ %y, %entry ]
+ ret i32 %cond
+}
+
+; CHECK: tbz_64
+; CHECK: {{ands.*xzr,|tst}} x2, #0x8000000000000000
+; CHECK-NEXT: csel x0, x1, x0, eq
+; CHECK-NEXT: ret
+define i64 @tbz_64(i64 %x, i64 %y, i64 %c) nounwind ssp {
+entry:
+ %mask = and i64 %c, 9223372036854775808
+ %tobool = icmp ne i64 %mask, 0
+ br i1 %tobool, label %ne_bb, label %done
+
+ne_bb:
+ br label %done
+
+done:
+ %cond = phi i64 [ %x, %ne_bb ], [ %y, %entry ]
+ ret i64 %cond
+}
+
+; This function from 175.vpr folds an ADDWri into a CSINC.
+; Remember to clear the kill flag on the ADDWri.
+define i32 @get_ytrack_to_xtracks() nounwind ssp {
+entry:
+ br label %for.body
+
+for.body:
+ %x0 = load i32* undef, align 4
+ br i1 undef, label %if.then.i146, label %is_sbox.exit155
+
+if.then.i146:
+ %add8.i143 = add nsw i32 0, %x0
+ %rem.i144 = srem i32 %add8.i143, %x0
+ %add9.i145 = add i32 %rem.i144, 1
+ br label %is_sbox.exit155
+
+is_sbox.exit155: ; preds = %if.then.i146, %for.body
+ %seg_offset.0.i151 = phi i32 [ %add9.i145, %if.then.i146 ], [ undef, %for.body ]
+ %idxprom15.i152 = sext i32 %seg_offset.0.i151 to i64
+ %arrayidx18.i154 = getelementptr inbounds i32* null, i64 %idxprom15.i152
+ %x1 = load i32* %arrayidx18.i154, align 4
+ br i1 undef, label %for.body51, label %for.body
+
+for.body51: ; preds = %is_sbox.exit155
+ call fastcc void @get_switch_type(i32 %x1, i32 undef, i16 signext undef, i16 signext undef, i16* undef)
+ unreachable
+}
+declare fastcc void @get_switch_type(i32, i32, i16 signext, i16 signext, i16* nocapture) nounwind ssp
diff --git a/test/CodeGen/AArch64/arm64-elf-calls.ll b/test/CodeGen/AArch64/arm64-elf-calls.ll
new file mode 100644
index 0000000..8c40203
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-elf-calls.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -filetype=obj -o - %s | llvm-objdump -triple=arm64-linux-gnu - -r | FileCheck %s --check-prefix=CHECK-OBJ
+
+declare void @callee()
+
+define void @caller() {
+ call void @callee()
+ ret void
+; CHECK-LABEL: caller:
+; CHECK: bl callee
+; CHECK-OBJ: R_AARCH64_CALL26 callee
+}
+
+define void @tail_caller() {
+ tail call void @callee()
+ ret void
+; CHECK-LABEL: tail_caller:
+; CHECK: b callee
+; CHECK-OBJ: R_AARCH64_JUMP26 callee
+}
diff --git a/test/CodeGen/AArch64/arm64-elf-constpool.ll b/test/CodeGen/AArch64/arm64-elf-constpool.ll
new file mode 100644
index 0000000..95d3343
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-elf-constpool.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -O0 -o - %s | FileCheck %s
+
+; -O0 is checked for FastISel purposes: it has a separate path that
+; creates a constant-pool entry for floating-point values.
+
+define double @needs_const() {
+ ret double 3.14159
+; CHECK: .LCPI0_0:
+
+; CHECK: adrp {{x[0-9]+}}, .LCPI0_0
+; CHECK: ldr d0, [{{x[0-9]+}}, :lo12:.LCPI0_0]
+}
diff --git a/test/CodeGen/AArch64/arm64-elf-globals.ll b/test/CodeGen/AArch64/arm64-elf-globals.ll
new file mode 100644
index 0000000..4ed44e7
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-elf-globals.ll
@@ -0,0 +1,115 @@
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s -mcpu=cyclone | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s -O0 -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST
+; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-PIC
+; RUN: llc -mtriple=arm64-linux-gnu -O0 -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST-PIC
+
+@var8 = external global i8, align 1
+@var16 = external global i16, align 2
+@var32 = external global i32, align 4
+@var64 = external global i64, align 8
+
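+; Without PIC the globals are addressed directly via adrp + :lo12:; with PIC
+; the address is first loaded from the GOT (adrp :got: + ldr :got_lo12:).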
+define i8 @test_i8(i8 %new) {
+ %val = load i8* @var8, align 1
+ store i8 %new, i8* @var8
+ ret i8 %val
+; CHECK-LABEL: test_i8:
+; CHECK: adrp x[[HIREG:[0-9]+]], var8
+; CHECK: ldrb {{w[0-9]+}}, [x[[HIREG]], :lo12:var8]
+; CHECK: strb {{w[0-9]+}}, [x[[HIREG]], :lo12:var8]
+
+; CHECK-PIC-LABEL: test_i8:
+; CHECK-PIC: adrp x[[HIREG:[0-9]+]], :got:var8
+; CHECK-PIC: ldr x[[VAR_ADDR:[0-9]+]], [x[[HIREG]], :got_lo12:var8]
+; CHECK-PIC: ldrb {{w[0-9]+}}, [x[[VAR_ADDR]]]
+
+; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var8
+; CHECK-FAST: ldrb {{w[0-9]+}}, [x[[HIREG]], :lo12:var8]
+
+; CHECK-FAST-PIC: adrp x[[HIREG:[0-9]+]], :got:var8
+; CHECK-FAST-PIC: ldr x[[VARADDR:[0-9]+]], [x[[HIREG]], :got_lo12:var8]
+; CHECK-FAST-PIC: ldr {{w[0-9]+}}, [x[[VARADDR]]]
+}
+
+define i16 @test_i16(i16 %new) {
+ %val = load i16* @var16, align 2
+ store i16 %new, i16* @var16
+ ret i16 %val
+; CHECK-LABEL: test_i16:
+; CHECK: adrp x[[HIREG:[0-9]+]], var16
+; CHECK: ldrh {{w[0-9]+}}, [x[[HIREG]], :lo12:var16]
+; CHECK: strh {{w[0-9]+}}, [x[[HIREG]], :lo12:var16]
+
+; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var16
+; CHECK-FAST: ldrh {{w[0-9]+}}, [x[[HIREG]], :lo12:var16]
+}
+
+define i32 @test_i32(i32 %new) {
+ %val = load i32* @var32, align 4
+ store i32 %new, i32* @var32
+ ret i32 %val
+; CHECK-LABEL: test_i32:
+; CHECK: adrp x[[HIREG:[0-9]+]], var32
+; CHECK: ldr {{w[0-9]+}}, [x[[HIREG]], :lo12:var32]
+; CHECK: str {{w[0-9]+}}, [x[[HIREG]], :lo12:var32]
+
+; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var32
+; CHECK-FAST: add {{x[0-9]+}}, x[[HIREG]], :lo12:var32
+}
+
+define i64 @test_i64(i64 %new) {
+ %val = load i64* @var64, align 8
+ store i64 %new, i64* @var64
+ ret i64 %val
+; CHECK-LABEL: test_i64:
+; CHECK: adrp x[[HIREG:[0-9]+]], var64
+; CHECK: ldr {{x[0-9]+}}, [x[[HIREG]], :lo12:var64]
+; CHECK: str {{x[0-9]+}}, [x[[HIREG]], :lo12:var64]
+
+; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var64
+; CHECK-FAST: add {{x[0-9]+}}, x[[HIREG]], :lo12:var64
+}
+
+define i64* @test_addr() {
+ ret i64* @var64
+; CHECK-LABEL: test_addr:
+; CHECK: adrp [[HIREG:x[0-9]+]], var64
+; CHECK: add x0, [[HIREG]], :lo12:var64
+
+; CHECK-FAST: adrp [[HIREG:x[0-9]+]], var64
+; CHECK-FAST: add x0, [[HIREG]], :lo12:var64
+}
+
+@hiddenvar = hidden global i32 0, align 4
+@protectedvar = protected global i32 0, align 4
+
+define i32 @test_vis() {
+ %lhs = load i32* @hiddenvar, align 4
+ %rhs = load i32* @protectedvar, align 4
+ %ret = add i32 %lhs, %rhs
+ ret i32 %ret
+; CHECK-PIC: adrp {{x[0-9]+}}, hiddenvar
+; CHECK-PIC: ldr {{w[0-9]+}}, [{{x[0-9]+}}, :lo12:hiddenvar]
+; CHECK-PIC: adrp {{x[0-9]+}}, protectedvar
+; CHECK-PIC: ldr {{w[0-9]+}}, [{{x[0-9]+}}, :lo12:protectedvar]
+}
+
+@var_default = external global [2 x i32]
+
+define i32 @test_default_align() {
+ %addr = getelementptr [2 x i32]* @var_default, i32 0, i32 0
+ %val = load i32* %addr
+ ret i32 %val
+; CHECK-LABEL: test_default_align:
+; CHECK: adrp x[[HIREG:[0-9]+]], var_default
+; CHECK: ldr w0, [x[[HIREG]], :lo12:var_default]
+}
+
+define i64 @test_default_unaligned() {
+ %addr = bitcast [2 x i32]* @var_default to i64*
+ %val = load i64* %addr
+ ret i64 %val
+; CHECK-LABEL: test_default_unaligned:
+; CHECK: adrp [[HIREG:x[0-9]+]], var_default
+; CHECK: add x[[ADDR:[0-9]+]], [[HIREG]], :lo12:var_default
+; CHECK: ldr x0, [x[[ADDR]]]
+}
diff --git a/test/CodeGen/AArch64/arm64-ext.ll b/test/CodeGen/AArch64/arm64-ext.ll
new file mode 100644
index 0000000..67860de
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-ext.ll
@@ -0,0 +1,118 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @test_vextd(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextd:
+;CHECK: {{ext.8b.*#3}}
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+ ret <8 x i8> %tmp3
+}
+
+define <8 x i8> @test_vextRd(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextRd:
+;CHECK: {{ext.8b.*#5}}
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4>
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @test_vextq(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextq:
+;CHECK: {{ext.16b.*3}}
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
+ ret <16 x i8> %tmp3
+}
+
+define <16 x i8> @test_vextRq(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextRq:
+;CHECK: {{ext.16b.*7}}
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @test_vextd16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: test_vextd16:
+;CHECK: {{ext.8b.*#6}}
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+ ret <4 x i16> %tmp3
+}
+
+define <4 x i32> @test_vextq32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: test_vextq32:
+;CHECK: {{ext.16b.*12}}
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+ ret <4 x i32> %tmp3
+}
+
+; Undef shuffle indices should not prevent matching to VEXT:
+
+define <8 x i8> @test_vextd_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextd_undef:
+;CHECK: {{ext.8b.*}}
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10>
+ ret <8 x i8> %tmp3
+}
+
+define <8 x i8> @test_vextd_undef2(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextd_undef2:
+;CHECK: {{ext.8b.*#6}}
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 5>
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @test_vextRq_undef(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: test_vextRq_undef:
+;CHECK: {{ext.16b.*#7}}
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 23, i32 24, i32 25, i32 26, i32 undef, i32 undef, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 undef, i32 6>
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @test_vextRq_undef2(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: test_vextRq_undef2:
+;CHECK: {{ext.16b.*#10}}
+ %tmp1 = load <8 x i16>* %A
+ %vext = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 4>
+ ret <8 x i16> %vext;
+}
+
+; Tests for the ReconstructShuffle function. Indices have to be chosen
+; carefully so that lowering is reached with a BUILD_VECTOR.
+
+; One vector needs a vext, the other can be handled by extract_subvector.
+; Also checks that interleaving of the sources is handled correctly.
+; In essence: a vext is used on %A, and something saner than a stack load/store produces the final result.
+define <4 x i16> @test_interleaved(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: test_interleaved:
+;CHECK: ext.8b
+;CHECK: zip1.4h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32> <i32 3, i32 8, i32 5, i32 9>
+ ret <4 x i16> %tmp3
+}
+
+; An undef in the shuffle list should still be optimizable
+define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: test_undef:
+;CHECK: zip1.4h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32> <i32 undef, i32 8, i32 5, i32 9>
+ ret <4 x i16> %tmp3
+}
diff --git a/test/CodeGen/AArch64/arm64-extend-int-to-fp.ll b/test/CodeGen/AArch64/arm64-extend-int-to-fp.ll
new file mode 100644
index 0000000..048fdb0
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extend-int-to-fp.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <4 x float> @foo(<4 x i16> %a) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: ushll.4s v0, v0, #0
+; CHECK-NEXT: ucvtf.4s v0, v0
+; CHECK-NEXT: ret
+ %vcvt.i = uitofp <4 x i16> %a to <4 x float>
+ ret <4 x float> %vcvt.i
+}
+
+define <4 x float> @bar(<4 x i16> %a) nounwind {
+; CHECK-LABEL: bar:
+; CHECK: sshll.4s v0, v0, #0
+; CHECK-NEXT: scvtf.4s v0, v0
+; CHECK-NEXT: ret
+ %vcvt.i = sitofp <4 x i16> %a to <4 x float>
+ ret <4 x float> %vcvt.i
+}
diff --git a/test/CodeGen/AArch64/arm64-extend.ll b/test/CodeGen/AArch64/arm64-extend.ll
new file mode 100644
index 0000000..afcaca2
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extend.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
+@array = external global [0 x i32]
+
+define i64 @foo(i32 %i) {
+; CHECK: foo
+; CHECK: adrp x[[REG:[0-9]+]], _array@GOTPAGE
+; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _array@GOTPAGEOFF]
+; CHECK: ldrsw x0, [x[[REG1]], w0, sxtw #2]
+; CHECK: ret
+ %idxprom = sext i32 %i to i64
+ %arrayidx = getelementptr inbounds [0 x i32]* @array, i64 0, i64 %idxprom
+ %tmp1 = load i32* %arrayidx, align 4
+ %conv = sext i32 %tmp1 to i64
+ ret i64 %conv
+}
diff --git a/test/CodeGen/AArch64/arm64-extern-weak.ll b/test/CodeGen/AArch64/arm64-extern-weak.ll
new file mode 100644
index 0000000..a239403
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extern-weak.ll
@@ -0,0 +1,51 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -o - < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -code-model=large -o - < %s | FileCheck --check-prefix=CHECK-LARGE %s
+
+declare extern_weak i32 @var()
+
+define i32()* @foo() {
+; The usual ADRP/ADD pair can't be used for a weak reference because it must
+; evaluate to 0 if the symbol is undefined. The address is loaded from the GOT instead.
+ ret i32()* @var
+
+; CHECK: adrp x[[VAR:[0-9]+]], :got:var
+; CHECK: ldr x0, [x[[VAR]], :got_lo12:var]
+
+ ; In the large model, the usual relocations are absolute and can
+ ; materialise 0.
+; CHECK-LARGE: movz x0, #:abs_g3:var
+; CHECK-LARGE: movk x0, #:abs_g2_nc:var
+; CHECK-LARGE: movk x0, #:abs_g1_nc:var
+; CHECK-LARGE: movk x0, #:abs_g0_nc:var
+}
+
+
+@arr_var = extern_weak global [10 x i32]
+
+define i32* @bar() {
+ %addr = getelementptr [10 x i32]* @arr_var, i32 0, i32 5
+; CHECK: adrp x[[ARR_VAR_HI:[0-9]+]], :got:arr_var
+; CHECK: ldr [[ARR_VAR:x[0-9]+]], [x[[ARR_VAR_HI]], :got_lo12:arr_var]
+; CHECK: add x0, [[ARR_VAR]], #20
+ ret i32* %addr
+
+ ; In the large model, the usual relocations are absolute and can
+ ; materialise 0.
+; CHECK-LARGE: movz [[ARR_VAR:x[0-9]+]], #:abs_g3:arr_var
+; CHECK-LARGE: movk [[ARR_VAR]], #:abs_g2_nc:arr_var
+; CHECK-LARGE: movk [[ARR_VAR]], #:abs_g1_nc:arr_var
+; CHECK-LARGE: movk [[ARR_VAR]], #:abs_g0_nc:arr_var
+}
+
+@defined_weak_var = internal unnamed_addr global i32 0
+
+define i32* @wibble() {
+ ret i32* @defined_weak_var
+; CHECK: adrp [[BASE:x[0-9]+]], defined_weak_var
+; CHECK: add x0, [[BASE]], :lo12:defined_weak_var
+
+; CHECK-LARGE: movz x0, #:abs_g3:defined_weak_var
+; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var
+; CHECK-LARGE: movk x0, #:abs_g1_nc:defined_weak_var
+; CHECK-LARGE: movk x0, #:abs_g0_nc:defined_weak_var
+}
diff --git a/test/CodeGen/AArch64/arm64-extload-knownzero.ll b/test/CodeGen/AArch64/arm64-extload-knownzero.ll
new file mode 100644
index 0000000..14e5fd3
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extload-knownzero.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+; rdar://12771555
+
+define void @foo(i16* %ptr, i32 %a) nounwind {
+entry:
+; CHECK-LABEL: foo:
+ %tmp1 = icmp ult i32 %a, 100
+ br i1 %tmp1, label %bb1, label %bb2
+bb1:
+; CHECK: %bb1
+; CHECK: ldrh [[REG:w[0-9]+]]
+ %tmp2 = load i16* %ptr, align 2
+ br label %bb2
+bb2:
+; CHECK: %bb2
+; CHECK-NOT: and {{w[0-9]+}}, [[REG]], #0xffff
+; CHECK: cmp [[REG]], #23
+ %tmp3 = phi i16 [ 0, %entry ], [ %tmp2, %bb1 ]
+ %cmp = icmp ult i16 %tmp3, 24
+ br i1 %cmp, label %bb3, label %exit
+bb3:
+ call void @bar() nounwind
+ br label %exit
+exit:
+ ret void
+}
+
+declare void @bar ()
diff --git a/test/CodeGen/AArch64/arm64-extract.ll b/test/CodeGen/AArch64/arm64-extract.ll
new file mode 100644
index 0000000..0198466
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extract.ll
@@ -0,0 +1,58 @@
+; RUN: llc -aarch64-extr-generation=true -verify-machineinstrs < %s \
+; RUN: -march=arm64 | FileCheck %s
+
+define i64 @ror_i64(i64 %in) {
+; CHECK-LABEL: ror_i64:
+ %left = shl i64 %in, 19
+ %right = lshr i64 %in, 45
+ %val5 = or i64 %left, %right
+; CHECK: ror {{x[0-9]+}}, x0, #45
+ ret i64 %val5
+}
+
+define i32 @ror_i32(i32 %in) {
+; CHECK-LABEL: ror_i32:
+ %left = shl i32 %in, 9
+ %right = lshr i32 %in, 23
+ %val5 = or i32 %left, %right
+; CHECK: ror {{w[0-9]+}}, w0, #23
+ ret i32 %val5
+}
+
+define i32 @extr_i32(i32 %lhs, i32 %rhs) {
+; CHECK-LABEL: extr_i32:
+ %left = shl i32 %lhs, 6
+ %right = lshr i32 %rhs, 26
+ %val = or i32 %left, %right
+ ; Order of lhs and rhs matters here. Regalloc would have to be very odd to use
+ ; something other than w0 and w1.
+; CHECK: extr {{w[0-9]+}}, w0, w1, #26
+
+ ret i32 %val
+}
+
+define i64 @extr_i64(i64 %lhs, i64 %rhs) {
+; CHECK-LABEL: extr_i64:
+ %right = lshr i64 %rhs, 40
+ %left = shl i64 %lhs, 24
+ %val = or i64 %right, %left
+ ; Order of lhs and rhs matters here. Regalloc would have to be very odd to use
+ ; something other than x0 and x1.
+; CHECK: extr {{x[0-9]+}}, x0, x1, #40
+
+ ret i64 %val
+}
+
+; Regression test: a bad experimental pattern crept into git which optimised
+; this pattern to a single EXTR.
+define i32 @extr_regress(i32 %a, i32 %b) {
+; CHECK-LABEL: extr_regress:
+
+ %sh1 = shl i32 %a, 14
+ %sh2 = lshr i32 %b, 14
+ %val = or i32 %sh2, %sh1
+; CHECK-NOT: extr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, #{{[0-9]+}}
+
+ ret i32 %val
+; CHECK: ret
+}
diff --git a/test/CodeGen/AArch64/arm64-extract_subvector.ll b/test/CodeGen/AArch64/arm64-extract_subvector.ll
new file mode 100644
index 0000000..8b15a64
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extract_subvector.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
+
+; Extracting the upper half of a vector is an "ext.16b v0, v0, v0, #8" insn.
+
+define <8 x i8> @v8i8(<16 x i8> %a) nounwind {
+; CHECK: v8i8
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x i8> %ret
+}
+
+define <4 x i16> @v4i16(<8 x i16> %a) nounwind {
+; CHECK-LABEL: v4i16:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ ret <4 x i16> %ret
+}
+
+define <2 x i32> @v2i32(<4 x i32> %a) nounwind {
+; CHECK-LABEL: v2i32:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+ ret <2 x i32> %ret
+}
+
+define <1 x i64> @v1i64(<2 x i64> %a) nounwind {
+; CHECK-LABEL: v1i64:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
+ ret <1 x i64> %ret
+}
+
+define <2 x float> @v2f32(<4 x float> %a) nounwind {
+; CHECK-LABEL: v2f32:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
+ ret <2 x float> %ret
+}
+
+define <1 x double> @v1f64(<2 x double> %a) nounwind {
+; CHECK-LABEL: v1f64:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <2 x double> %a, <2 x double> %a, <1 x i32> <i32 1>
+ ret <1 x double> %ret
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll b/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
new file mode 100644
index 0000000..ebd847e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+@sortlist = common global [5001 x i32] zeroinitializer, align 16
+@sortlist2 = common global [5001 x i64] zeroinitializer, align 16
+
+; Load an address with an offset larger than LDR imm can handle
+define i32 @foo() nounwind {
+entry:
+; CHECK: @foo
+; CHECK: adrp x[[REG:[0-9]+]], _sortlist@GOTPAGE
+; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _sortlist@GOTPAGEOFF]
+; CHECK: movz x[[REG2:[0-9]+]], #0x4e20
+; CHECK: add x[[REG3:[0-9]+]], x[[REG1]], x[[REG2]]
+; CHECK: ldr w0, [x[[REG3]]]
+; CHECK: ret
+ %0 = load i32* getelementptr inbounds ([5001 x i32]* @sortlist, i32 0, i64 5000), align 4
+ ret i32 %0
+}
+
+define i64 @foo2() nounwind {
+entry:
+; CHECK: @foo2
+; CHECK: adrp x[[REG:[0-9]+]], _sortlist2@GOTPAGE
+; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _sortlist2@GOTPAGEOFF]
+; CHECK: movz x[[REG2:[0-9]+]], #0x9c40
+; CHECK: add x[[REG3:[0-9]+]], x[[REG1]], x[[REG2]]
+; CHECK: ldr x0, [x[[REG3]]]
+; CHECK: ret
+ %0 = load i64* getelementptr inbounds ([5001 x i64]* @sortlist2, i32 0, i64 5000), align 4
+ ret i64 %0
+}
+
+; Load an address with a ridiculously large offset.
+; rdar://12505553
+@pd2 = common global i8* null, align 8
+
+define signext i8 @foo3() nounwind ssp {
+entry:
+; CHECK: @foo3
+; CHECK: movz x[[REG:[0-9]+]], #0xb3a, lsl #32
+; CHECK: movk x[[REG]], #0x73ce, lsl #16
+; CHECK: movk x[[REG]], #0x2ff2
+ %0 = load i8** @pd2, align 8
+ %arrayidx = getelementptr inbounds i8* %0, i64 12345678901234
+ %1 = load i8* %arrayidx, align 1
+ ret i8 %1
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll b/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
new file mode 100644
index 0000000..1706e9e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
@@ -0,0 +1,25 @@
+; This test should cause TargetMaterializeAlloca to be invoked.
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+%struct.S1Ty = type { i64 }
+%struct.S2Ty = type { %struct.S1Ty, %struct.S1Ty }
+
+define void @takeS1(%struct.S1Ty* %V) nounwind {
+entry:
+ %V.addr = alloca %struct.S1Ty*, align 8
+ store %struct.S1Ty* %V, %struct.S1Ty** %V.addr, align 8
+ ret void
+}
+
+define void @main() nounwind {
+entry:
+; CHECK: main
+; CHECK: mov x29, sp
+; CHECK: mov x[[REG:[0-9]+]], sp
+; CHECK-NEXT: orr x[[REG1:[0-9]+]], xzr, #0x8
+; CHECK-NEXT: add x0, x[[REG]], x[[REG1]]
+ %E = alloca %struct.S2Ty, align 4
+ %B = getelementptr inbounds %struct.S2Ty* %E, i32 0, i32 1
+ call void @takeS1(%struct.S1Ty* %B)
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-br.ll b/test/CodeGen/AArch64/arm64-fast-isel-br.ll
new file mode 100644
index 0000000..37a8295
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-br.ll
@@ -0,0 +1,155 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin -mcpu=cyclone | FileCheck %s
+
+define void @branch1() nounwind uwtable ssp {
+ %x = alloca i32, align 4
+ store i32 0, i32* %x, align 4
+ %1 = load i32* %x, align 4
+ %2 = icmp ne i32 %1, 0
+ br i1 %2, label %3, label %4
+
+; <label>:3 ; preds = %0
+ br label %4
+
+; <label>:4 ; preds = %3, %0
+ ret void
+}
+
+define void @branch2() nounwind uwtable ssp {
+ %1 = alloca i32, align 4
+ %x = alloca i32, align 4
+ %y = alloca i32, align 4
+ %z = alloca i32, align 4
+ store i32 0, i32* %1
+ store i32 1, i32* %y, align 4
+ store i32 1, i32* %x, align 4
+ store i32 0, i32* %z, align 4
+ %2 = load i32* %x, align 4
+ %3 = icmp ne i32 %2, 0
+ br i1 %3, label %4, label %5
+
+; <label>:4 ; preds = %0
+ store i32 0, i32* %1
+ br label %14
+
+; <label>:5 ; preds = %0
+ %6 = load i32* %y, align 4
+ %7 = icmp ne i32 %6, 0
+ br i1 %7, label %8, label %13
+
+; <label>:8 ; preds = %5
+ %9 = load i32* %z, align 4
+ %10 = icmp ne i32 %9, 0
+ br i1 %10, label %11, label %12
+
+; <label>:11 ; preds = %8
+ store i32 1, i32* %1
+ br label %14
+
+; <label>:12 ; preds = %8
+ store i32 0, i32* %1
+ br label %14
+
+; <label>:13 ; preds = %5
+ br label %14
+
+; <label>:14 ; preds = %4, %11, %12, %13
+ %15 = load i32* %1
+ ret void
+}
+
+define void @true_() nounwind uwtable ssp {
+; CHECK: @true_
+; CHECK: b LBB2_1
+ br i1 true, label %1, label %2
+
+; <label>:1
+; CHECK: LBB2_1
+ br label %2
+
+; <label>:2
+ ret void
+}
+
+define void @false_() nounwind uwtable ssp {
+; CHECK: @false_
+; CHECK: b LBB3_2
+ br i1 false, label %1, label %2
+
+; <label>:1
+ br label %2
+
+; <label>:2
+; CHECK: LBB3_2
+ ret void
+}
+
+define zeroext i8 @trunc_(i8 zeroext %a, i16 zeroext %b, i32 %c, i64 %d) {
+entry:
+ %a.addr = alloca i8, align 1
+ %b.addr = alloca i16, align 2
+ %c.addr = alloca i32, align 4
+ %d.addr = alloca i64, align 8
+ store i8 %a, i8* %a.addr, align 1
+ store i16 %b, i16* %b.addr, align 2
+ store i32 %c, i32* %c.addr, align 4
+ store i64 %d, i64* %d.addr, align 8
+ %0 = load i16* %b.addr, align 2
+; CHECK: and w0, w0, #0x1
+; CHECK: subs w0, w0, #0
+; CHECK: b.eq LBB4_2
+ %conv = trunc i16 %0 to i1
+ br i1 %conv, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ call void @foo1()
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %1 = load i32* %c.addr, align 4
+; CHECK: and w[[REG:[0-9]+]], w{{[0-9]+}}, #0x1
+; CHECK: subs w{{[0-9]+}}, w[[REG]], #0
+; CHECK: b.eq LBB4_4
+ %conv1 = trunc i32 %1 to i1
+ br i1 %conv1, label %if.then3, label %if.end4
+
+if.then3: ; preds = %if.end
+ call void @foo1()
+ br label %if.end4
+
+if.end4: ; preds = %if.then3, %if.end
+ %2 = load i64* %d.addr, align 8
+; CHECK: subs w{{[0-9]+}}, w{{[0-9]+}}, #0
+; CHECK: b.eq LBB4_6
+ %conv5 = trunc i64 %2 to i1
+ br i1 %conv5, label %if.then7, label %if.end8
+
+if.then7: ; preds = %if.end4
+ call void @foo1()
+ br label %if.end8
+
+if.end8: ; preds = %if.then7, %if.end4
+ %3 = load i8* %a.addr, align 1
+ ret i8 %3
+}
+
+declare void @foo1()
+
+; rdar://15174028
+define i32 @trunc64(i64 %foo) nounwind {
+; CHECK: trunc64
+; CHECK: orr [[REG:x[0-9]+]], xzr, #0x1
+; CHECK: and [[REG2:x[0-9]+]], x0, [[REG]]
+; CHECK: mov x[[REG3:[0-9]+]], [[REG2]]
+; CHECK: and [[REG4:w[0-9]+]], w[[REG3]], #0x1
+; CHECK: subs {{w[0-9]+}}, [[REG4]], #0
+; CHECK: b.eq LBB5_2
+ %a = and i64 %foo, 1
+ %b = trunc i64 %a to i1
+ br i1 %b, label %if.then, label %if.else
+
+if.then:
+ ret i32 1
+
+if.else:
+ ret i32 0
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-call.ll b/test/CodeGen/AArch64/arm64-fast-isel-call.ll
new file mode 100644
index 0000000..8d756ae
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-call.ll
@@ -0,0 +1,100 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64_be-linux-gnu | FileCheck %s --check-prefix=CHECK-BE
+
+define void @call0() nounwind {
+entry:
+ ret void
+}
+
+define void @foo0() nounwind {
+entry:
+; CHECK: foo0
+; CHECK: bl _call0
+ call void @call0()
+ ret void
+}
+
+define i32 @call1(i32 %a) nounwind {
+entry:
+ %a.addr = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ %tmp = load i32* %a.addr, align 4
+ ret i32 %tmp
+}
+
+define i32 @foo1(i32 %a) nounwind {
+entry:
+; CHECK: foo1
+; CHECK: stur w0, [x29, #-4]
+; CHECK-NEXT: ldur w0, [x29, #-4]
+; CHECK-NEXT: bl _call1
+ %a.addr = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ %tmp = load i32* %a.addr, align 4
+ %call = call i32 @call1(i32 %tmp)
+ ret i32 %call
+}
+
+define i32 @sext_(i8 %a, i16 %b) nounwind {
+entry:
+; CHECK: @sext_
+; CHECK: sxtb w0, w0
+; CHECK: sxth w1, w1
+; CHECK: bl _foo_sext_
+ call void @foo_sext_(i8 signext %a, i16 signext %b)
+ ret i32 0
+}
+
+declare void @foo_sext_(i8 %a, i16 %b)
+
+define i32 @zext_(i8 %a, i16 %b) nounwind {
+entry:
+; CHECK: @zext_
+; CHECK: uxtb w0, w0
+; CHECK: uxth w1, w1
+ call void @foo_zext_(i8 zeroext %a, i16 zeroext %b)
+ ret i32 0
+}
+
+declare void @foo_zext_(i8 %a, i16 %b)
+
+define i32 @t1(i32 %argc, i8** nocapture %argv) {
+entry:
+; CHECK: @t1
+; The last parameter will be passed on the stack as an i8.
+; CHECK: strb w{{[0-9]+}}, [sp]
+; CHECK-NEXT: bl _bar
+ %call = call i32 @bar(i8 zeroext 0, i8 zeroext -8, i8 zeroext -69, i8 zeroext 28, i8 zeroext 40, i8 zeroext -70, i8 zeroext 28, i8 zeroext 39, i8 zeroext -41)
+ ret i32 0
+}
+
+declare i32 @bar(i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext)
+
+; Test materialization of integers. The target-independent selector handles this.
+define i32 @t2() {
+entry:
+; CHECK: @t2
+; CHECK: movz x0, #0
+; CHECK: orr w1, wzr, #0xfffffff8
+; CHECK: orr w[[REG:[0-9]+]], wzr, #0x3ff
+; CHECK: orr w[[REG2:[0-9]+]], wzr, #0x2
+; CHECK: movz w[[REG3:[0-9]+]], #0
+; CHECK: orr w[[REG4:[0-9]+]], wzr, #0x1
+; CHECK: uxth w2, w[[REG]]
+; CHECK: sxtb w3, w[[REG2]]
+; CHECK: and w4, w[[REG3]], #0x1
+; CHECK: and w5, w[[REG4]], #0x1
+; CHECK: bl _func2
+ %call = call i32 @func2(i64 zeroext 0, i32 signext -8, i16 zeroext 1023, i8 signext -254, i1 zeroext 0, i1 zeroext 1)
+ ret i32 0
+}
+
+declare i32 @func2(i64 zeroext, i32 signext, i16 zeroext, i8 signext, i1 zeroext, i1 zeroext)
+
+declare void @callee_b0f(i8 %bp10, i8 %bp11, i8 %bp12, i8 %bp13, i8 %bp14, i8 %bp15, i8 %bp17, i8 %bp18, i8 %bp19)
+define void @caller_b1f() {
+entry:
+ ; CHECK-BE: strb w{{.*}}, [sp, #7]
+ call void @callee_b0f(i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 42)
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll b/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll
new file mode 100644
index 0000000..c5417de
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll
@@ -0,0 +1,442 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin -mcpu=cyclone | FileCheck %s
+
+;; Test various conversions.
+define zeroext i32 @trunc_(i8 zeroext %a, i16 zeroext %b, i32 %c, i64 %d) nounwind ssp {
+entry:
+; CHECK: trunc_
+; CHECK: sub sp, sp, #16
+; CHECK: strb w0, [sp, #15]
+; CHECK: strh w1, [sp, #12]
+; CHECK: str w2, [sp, #8]
+; CHECK: str x3, [sp]
+; CHECK: ldr x3, [sp]
+; CHECK: mov x0, x3
+; CHECK: str w0, [sp, #8]
+; CHECK: ldr w0, [sp, #8]
+; CHECK: strh w0, [sp, #12]
+; CHECK: ldrh w0, [sp, #12]
+; CHECK: strb w0, [sp, #15]
+; CHECK: ldrb w0, [sp, #15]
+; CHECK: uxtb w0, w0
+; CHECK: add sp, sp, #16
+; CHECK: ret
+ %a.addr = alloca i8, align 1
+ %b.addr = alloca i16, align 2
+ %c.addr = alloca i32, align 4
+ %d.addr = alloca i64, align 8
+ store i8 %a, i8* %a.addr, align 1
+ store i16 %b, i16* %b.addr, align 2
+ store i32 %c, i32* %c.addr, align 4
+ store i64 %d, i64* %d.addr, align 8
+ %tmp = load i64* %d.addr, align 8
+ %conv = trunc i64 %tmp to i32
+ store i32 %conv, i32* %c.addr, align 4
+ %tmp1 = load i32* %c.addr, align 4
+ %conv2 = trunc i32 %tmp1 to i16
+ store i16 %conv2, i16* %b.addr, align 2
+ %tmp3 = load i16* %b.addr, align 2
+ %conv4 = trunc i16 %tmp3 to i8
+ store i8 %conv4, i8* %a.addr, align 1
+ %tmp5 = load i8* %a.addr, align 1
+ %conv6 = zext i8 %tmp5 to i32
+ ret i32 %conv6
+}
+
+define i64 @zext_(i8 zeroext %a, i16 zeroext %b, i32 %c, i64 %d) nounwind ssp {
+entry:
+; CHECK: zext_
+; CHECK: sub sp, sp, #16
+; CHECK: strb w0, [sp, #15]
+; CHECK: strh w1, [sp, #12]
+; CHECK: str w2, [sp, #8]
+; CHECK: str x3, [sp]
+; CHECK: ldrb w0, [sp, #15]
+; CHECK: uxtb w0, w0
+; CHECK: strh w0, [sp, #12]
+; CHECK: ldrh w0, [sp, #12]
+; CHECK: uxth w0, w0
+; CHECK: str w0, [sp, #8]
+; CHECK: ldr w0, [sp, #8]
+; CHECK: mov x3, x0
+; CHECK: ubfx x3, x3, #0, #32
+; CHECK: str x3, [sp]
+; CHECK: ldr x0, [sp]
+; CHECK: ret
+ %a.addr = alloca i8, align 1
+ %b.addr = alloca i16, align 2
+ %c.addr = alloca i32, align 4
+ %d.addr = alloca i64, align 8
+ store i8 %a, i8* %a.addr, align 1
+ store i16 %b, i16* %b.addr, align 2
+ store i32 %c, i32* %c.addr, align 4
+ store i64 %d, i64* %d.addr, align 8
+ %tmp = load i8* %a.addr, align 1
+ %conv = zext i8 %tmp to i16
+ store i16 %conv, i16* %b.addr, align 2
+ %tmp1 = load i16* %b.addr, align 2
+ %conv2 = zext i16 %tmp1 to i32
+ store i32 %conv2, i32* %c.addr, align 4
+ %tmp3 = load i32* %c.addr, align 4
+ %conv4 = zext i32 %tmp3 to i64
+ store i64 %conv4, i64* %d.addr, align 8
+ %tmp5 = load i64* %d.addr, align 8
+ ret i64 %tmp5
+}
+
+define i32 @zext_i1_i32(i1 zeroext %a) nounwind ssp {
+entry:
+; CHECK: @zext_i1_i32
+; CHECK: and w0, w0, #0x1
+ %conv = zext i1 %a to i32
+ ret i32 %conv;
+}
+
+define i64 @zext_i1_i64(i1 zeroext %a) nounwind ssp {
+entry:
+; CHECK: @zext_i1_i64
+; CHECK: and w0, w0, #0x1
+ %conv = zext i1 %a to i64
+ ret i64 %conv;
+}
+
+define i64 @sext_(i8 signext %a, i16 signext %b, i32 %c, i64 %d) nounwind ssp {
+entry:
+; CHECK: sext_
+; CHECK: sub sp, sp, #16
+; CHECK: strb w0, [sp, #15]
+; CHECK: strh w1, [sp, #12]
+; CHECK: str w2, [sp, #8]
+; CHECK: str x3, [sp]
+; CHECK: ldrb w0, [sp, #15]
+; CHECK: sxtb w0, w0
+; CHECK: strh w0, [sp, #12]
+; CHECK: ldrh w0, [sp, #12]
+; CHECK: sxth w0, w0
+; CHECK: str w0, [sp, #8]
+; CHECK: ldr w0, [sp, #8]
+; CHECK: mov x3, x0
+; CHECK: sxtw x3, w3
+; CHECK: str x3, [sp]
+; CHECK: ldr x0, [sp]
+; CHECK: ret
+ %a.addr = alloca i8, align 1
+ %b.addr = alloca i16, align 2
+ %c.addr = alloca i32, align 4
+ %d.addr = alloca i64, align 8
+ store i8 %a, i8* %a.addr, align 1
+ store i16 %b, i16* %b.addr, align 2
+ store i32 %c, i32* %c.addr, align 4
+ store i64 %d, i64* %d.addr, align 8
+ %tmp = load i8* %a.addr, align 1
+ %conv = sext i8 %tmp to i16
+ store i16 %conv, i16* %b.addr, align 2
+ %tmp1 = load i16* %b.addr, align 2
+ %conv2 = sext i16 %tmp1 to i32
+ store i32 %conv2, i32* %c.addr, align 4
+ %tmp3 = load i32* %c.addr, align 4
+ %conv4 = sext i32 %tmp3 to i64
+ store i64 %conv4, i64* %d.addr, align 8
+ %tmp5 = load i64* %d.addr, align 8
+ ret i64 %tmp5
+}
+
+; Test sext i8 to i64
+
+define zeroext i64 @sext_i8_i64(i8 zeroext %in) {
+; CHECK-LABEL: sext_i8_i64:
+; CHECK: mov x[[TMP:[0-9]+]], x0
+; CHECK: sxtb x0, w[[TMP]]
+ %big = sext i8 %in to i64
+ ret i64 %big
+}
+
+define zeroext i64 @sext_i16_i64(i16 zeroext %in) {
+; CHECK-LABEL: sext_i16_i64:
+; CHECK: mov x[[TMP:[0-9]+]], x0
+; CHECK: sxth x0, w[[TMP]]
+ %big = sext i16 %in to i64
+ ret i64 %big
+}
+
+; Test sext i1 to i32
+define i32 @sext_i1_i32(i1 signext %a) nounwind ssp {
+entry:
+; CHECK: sext_i1_i32
+; CHECK: sbfx w0, w0, #0, #1
+ %conv = sext i1 %a to i32
+ ret i32 %conv
+}
+
+; Test sext i1 to i16
+define signext i16 @sext_i1_i16(i1 %a) nounwind ssp {
+entry:
+; CHECK: sext_i1_i16
+; CHECK: sbfx w0, w0, #0, #1
+ %conv = sext i1 %a to i16
+ ret i16 %conv
+}
+
+; Test sext i1 to i8
+define signext i8 @sext_i1_i8(i1 %a) nounwind ssp {
+entry:
+; CHECK: sext_i1_i8
+; CHECK: sbfx w0, w0, #0, #1
+ %conv = sext i1 %a to i8
+ ret i8 %conv
+}
+
+; Test fpext
+define double @fpext_(float %a) nounwind ssp {
+entry:
+; CHECK: fpext_
+; CHECK: fcvt d0, s0
+ %conv = fpext float %a to double
+ ret double %conv
+}
+
+; Test fptrunc
+define float @fptrunc_(double %a) nounwind ssp {
+entry:
+; CHECK: fptrunc_
+; CHECK: fcvt s0, d0
+ %conv = fptrunc double %a to float
+ ret float %conv
+}
+
+; Test fptosi
+define i32 @fptosi_ws(float %a) nounwind ssp {
+entry:
+; CHECK: fptosi_ws
+; CHECK: fcvtzs w0, s0
+ %conv = fptosi float %a to i32
+ ret i32 %conv
+}
+
+; Test fptosi
+define i32 @fptosi_wd(double %a) nounwind ssp {
+entry:
+; CHECK: fptosi_wd
+; CHECK: fcvtzs w0, d0
+ %conv = fptosi double %a to i32
+ ret i32 %conv
+}
+
+; Test fptoui
+define i32 @fptoui_ws(float %a) nounwind ssp {
+entry:
+; CHECK: fptoui_ws
+; CHECK: fcvtzu w0, s0
+ %conv = fptoui float %a to i32
+ ret i32 %conv
+}
+
+; Test fptoui
+define i32 @fptoui_wd(double %a) nounwind ssp {
+entry:
+; CHECK: fptoui_wd
+; CHECK: fcvtzu w0, d0
+ %conv = fptoui double %a to i32
+ ret i32 %conv
+}
+
+; Test sitofp
+define float @sitofp_sw_i1(i1 %a) nounwind ssp {
+entry:
+; CHECK: sitofp_sw_i1
+; CHECK: sbfx w0, w0, #0, #1
+; CHECK: scvtf s0, w0
+ %conv = sitofp i1 %a to float
+ ret float %conv
+}
+
+; Test sitofp
+define float @sitofp_sw_i8(i8 %a) nounwind ssp {
+entry:
+; CHECK: sitofp_sw_i8
+; CHECK: sxtb w0, w0
+; CHECK: scvtf s0, w0
+ %conv = sitofp i8 %a to float
+ ret float %conv
+}
+
+; Test sitofp
+define float @sitofp_sw_i16(i16 %a) nounwind ssp {
+entry:
+; CHECK: sitofp_sw_i16
+; CHECK: sxth w0, w0
+; CHECK: scvtf s0, w0
+ %conv = sitofp i16 %a to float
+ ret float %conv
+}
+
+; Test sitofp
+define float @sitofp_sw(i32 %a) nounwind ssp {
+entry:
+; CHECK: sitofp_sw
+; CHECK: scvtf s0, w0
+ %conv = sitofp i32 %a to float
+ ret float %conv
+}
+
+; Test sitofp
+define float @sitofp_sx(i64 %a) nounwind ssp {
+entry:
+; CHECK: sitofp_sx
+; CHECK: scvtf s0, x0
+ %conv = sitofp i64 %a to float
+ ret float %conv
+}
+
+; Test sitofp
+define double @sitofp_dw(i32 %a) nounwind ssp {
+entry:
+; CHECK: sitofp_dw
+; CHECK: scvtf d0, w0
+ %conv = sitofp i32 %a to double
+ ret double %conv
+}
+
+; Test sitofp
+define double @sitofp_dx(i64 %a) nounwind ssp {
+entry:
+; CHECK: sitofp_dx
+; CHECK: scvtf d0, x0
+ %conv = sitofp i64 %a to double
+ ret double %conv
+}
+
+; Test uitofp
+define float @uitofp_sw_i1(i1 %a) nounwind ssp {
+entry:
+; CHECK: uitofp_sw_i1
+; CHECK: and w0, w0, #0x1
+; CHECK: ucvtf s0, w0
+ %conv = uitofp i1 %a to float
+ ret float %conv
+}
+
+; Test uitofp
+define float @uitofp_sw_i8(i8 %a) nounwind ssp {
+entry:
+; CHECK: uitofp_sw_i8
+; CHECK: uxtb w0, w0
+; CHECK: ucvtf s0, w0
+ %conv = uitofp i8 %a to float
+ ret float %conv
+}
+
+; Test uitofp
+define float @uitofp_sw_i16(i16 %a) nounwind ssp {
+entry:
+; CHECK: uitofp_sw_i16
+; CHECK: uxth w0, w0
+; CHECK: ucvtf s0, w0
+ %conv = uitofp i16 %a to float
+ ret float %conv
+}
+
+; Test uitofp
+define float @uitofp_sw(i32 %a) nounwind ssp {
+entry:
+; CHECK: uitofp_sw
+; CHECK: ucvtf s0, w0
+ %conv = uitofp i32 %a to float
+ ret float %conv
+}
+
+; Test uitofp
+define float @uitofp_sx(i64 %a) nounwind ssp {
+entry:
+; CHECK: uitofp_sx
+; CHECK: ucvtf s0, x0
+ %conv = uitofp i64 %a to float
+ ret float %conv
+}
+
+; Test uitofp
+define double @uitofp_dw(i32 %a) nounwind ssp {
+entry:
+; CHECK: uitofp_dw
+; CHECK: ucvtf d0, w0
+ %conv = uitofp i32 %a to double
+ ret double %conv
+}
+
+; Test uitofp
+define double @uitofp_dx(i64 %a) nounwind ssp {
+entry:
+; CHECK: uitofp_dx
+; CHECK: ucvtf d0, x0
+ %conv = uitofp i64 %a to double
+ ret double %conv
+}
+
+define i32 @i64_trunc_i32(i64 %a) nounwind ssp {
+entry:
+; CHECK: i64_trunc_i32
+; CHECK: mov x1, x0
+ %conv = trunc i64 %a to i32
+ ret i32 %conv
+}
+
+define zeroext i16 @i64_trunc_i16(i64 %a) nounwind ssp {
+entry:
+; CHECK: i64_trunc_i16
+; CHECK: mov x[[REG:[0-9]+]], x0
+; CHECK: and [[REG2:w[0-9]+]], w[[REG]], #0xffff
+; CHECK: uxth w0, [[REG2]]
+ %conv = trunc i64 %a to i16
+ ret i16 %conv
+}
+
+define zeroext i8 @i64_trunc_i8(i64 %a) nounwind ssp {
+entry:
+; CHECK: i64_trunc_i8
+; CHECK: mov x[[REG:[0-9]+]], x0
+; CHECK: and [[REG2:w[0-9]+]], w[[REG]], #0xff
+; CHECK: uxtb w0, [[REG2]]
+ %conv = trunc i64 %a to i8
+ ret i8 %conv
+}
+
+define zeroext i1 @i64_trunc_i1(i64 %a) nounwind ssp {
+entry:
+; CHECK: i64_trunc_i1
+; CHECK: mov x[[REG:[0-9]+]], x0
+; CHECK: and [[REG2:w[0-9]+]], w[[REG]], #0x1
+; CHECK: and w0, [[REG2]], #0x1
+ %conv = trunc i64 %a to i1
+ ret i1 %conv
+}
+
+; rdar://15101939
+define void @stack_trunc() nounwind {
+; CHECK: stack_trunc
+; CHECK: sub sp, sp, #16
+; CHECK: ldr [[REG:x[0-9]+]], [sp]
+; CHECK: mov x[[REG2:[0-9]+]], [[REG]]
+; CHECK: and [[REG3:w[0-9]+]], w[[REG2]], #0xff
+; CHECK: strb [[REG3]], [sp, #15]
+; CHECK: add sp, sp, #16
+ %a = alloca i8, align 1
+ %b = alloca i64, align 8
+ %c = load i64* %b, align 8
+ %d = trunc i64 %c to i8
+ store i8 %d, i8* %a, align 1
+ ret void
+}
+
+define zeroext i64 @zext_i8_i64(i8 zeroext %in) {
+; CHECK-LABEL: zext_i8_i64:
+; CHECK: mov x[[TMP:[0-9]+]], x0
+; CHECK: ubfx x0, x[[TMP]], #0, #8
+ %big = zext i8 %in to i64
+ ret i64 %big
+}
+define zeroext i64 @zext_i16_i64(i16 zeroext %in) {
+; CHECK-LABEL: zext_i16_i64:
+; CHECK: mov x[[TMP:[0-9]+]], x0
+; CHECK: ubfx x0, x[[TMP]], #0, #16
+ %big = zext i16 %in to i64
+ ret i64 %big
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll b/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll
new file mode 100644
index 0000000..f030596
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll
@@ -0,0 +1,146 @@
+; RUN: llc < %s -O0 -fast-isel-abort -verify-machineinstrs -mtriple=arm64-apple-darwin | FileCheck %s
+
+define zeroext i1 @fcmp_float1(float %a) nounwind ssp {
+entry:
+; CHECK-LABEL: @fcmp_float1
+; CHECK: fcmp s0, #0.0
+; CHECK: cset w{{[0-9]+}}, ne
+ %cmp = fcmp une float %a, 0.000000e+00
+ ret i1 %cmp
+}
+
+define zeroext i1 @fcmp_float2(float %a, float %b) nounwind ssp {
+entry:
+; CHECK-LABEL: @fcmp_float2
+; CHECK: fcmp s0, s1
+; CHECK: cset w{{[0-9]+}}, ne
+ %cmp = fcmp une float %a, %b
+ ret i1 %cmp
+}
+
+define zeroext i1 @fcmp_double1(double %a) nounwind ssp {
+entry:
+; CHECK-LABEL: @fcmp_double1
+; CHECK: fcmp d0, #0.0
+; CHECK: cset w{{[0-9]+}}, ne
+ %cmp = fcmp une double %a, 0.000000e+00
+ ret i1 %cmp
+}
+
+define zeroext i1 @fcmp_double2(double %a, double %b) nounwind ssp {
+entry:
+; CHECK-LABEL: @fcmp_double2
+; CHECK: fcmp d0, d1
+; CHECK: cset w{{[0-9]+}}, ne
+ %cmp = fcmp une double %a, %b
+ ret i1 %cmp
+}
+
+; Check each fcmp condition
+define float @fcmp_oeq(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_oeq
+; CHECK: fcmp s0, s1
+; CHECK: cset w{{[0-9]+}}, eq
+ %cmp = fcmp oeq float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ogt(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_ogt
+; CHECK: fcmp s0, s1
+; CHECK: cset w{{[0-9]+}}, gt
+ %cmp = fcmp ogt float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_oge(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_oge
+; CHECK: fcmp s0, s1
+; CHECK: cset w{{[0-9]+}}, ge
+ %cmp = fcmp oge float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_olt(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_olt
+; CHECK: fcmp s0, s1
+; CHECK: cset w{{[0-9]+}}, mi
+ %cmp = fcmp olt float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ole(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_ole
+; CHECK: fcmp s0, s1
+; CHECK: cset w{{[0-9]+}}, ls
+ %cmp = fcmp ole float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ord(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_ord
+; CHECK: fcmp s0, s1
+; CHECK: cset {{w[0-9]+}}, vc
+ %cmp = fcmp ord float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_uno(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_uno
+; CHECK: fcmp s0, s1
+; CHECK: cset {{w[0-9]+}}, vs
+ %cmp = fcmp uno float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ugt(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_ugt
+; CHECK: fcmp s0, s1
+; CHECK: cset {{w[0-9]+}}, hi
+ %cmp = fcmp ugt float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_uge(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_uge
+; CHECK: fcmp s0, s1
+; CHECK: cset {{w[0-9]+}}, pl
+ %cmp = fcmp uge float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ult(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_ult
+; CHECK: fcmp s0, s1
+; CHECK: cset {{w[0-9]+}}, lt
+ %cmp = fcmp ult float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ule(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_ule
+; CHECK: fcmp s0, s1
+; CHECK: cset {{w[0-9]+}}, le
+ %cmp = fcmp ule float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_une(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_une
+; CHECK: fcmp s0, s1
+; CHECK: cset {{w[0-9]+}}, ne
+ %cmp = fcmp une float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-gv.ll b/test/CodeGen/AArch64/arm64-fast-isel-gv.ll
new file mode 100644
index 0000000..dc4d895
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-gv.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+; Test load/store of global value from global offset table.
+@seed = common global i64 0, align 8
+
+define void @Initrand() nounwind {
+entry:
+; CHECK: @Initrand
+; CHECK: adrp x[[REG:[0-9]+]], _seed@GOTPAGE
+; CHECK: ldr x[[REG2:[0-9]+]], [x[[REG]], _seed@GOTPAGEOFF]
+; CHECK: str x{{[0-9]+}}, [x[[REG2]]]
+ store i64 74755, i64* @seed, align 8
+ ret void
+}
+
+define i32 @Rand() nounwind {
+entry:
+; CHECK: @Rand
+; CHECK: adrp x[[REG:[0-9]+]], _seed@GOTPAGE
+; CHECK: ldr x[[REG2:[0-9]+]], [x[[REG]], _seed@GOTPAGEOFF]
+; CHECK: movz x[[REG3:[0-9]+]], #0x51d
+; CHECK: ldr x[[REG4:[0-9]+]], [x[[REG2]]]
+; CHECK: mul x[[REG5:[0-9]+]], x[[REG4]], x[[REG3]]
+; CHECK: movz x[[REG6:[0-9]+]], #0x3619
+; CHECK: add x[[REG7:[0-9]+]], x[[REG5]], x[[REG6]]
+; CHECK: orr x[[REG8:[0-9]+]], xzr, #0xffff
+; CHECK: and x[[REG9:[0-9]+]], x[[REG7]], x[[REG8]]
+; CHECK: str x[[REG9]], [x[[REG]]]
+; CHECK: ldr x{{[0-9]+}}, [x[[REG]]]
+ %0 = load i64* @seed, align 8
+ %mul = mul nsw i64 %0, 1309
+ %add = add nsw i64 %mul, 13849
+ %and = and i64 %add, 65535
+ store i64 %and, i64* @seed, align 8
+ %1 = load i64* @seed, align 8
+ %conv = trunc i64 %1 to i32
+ ret i32 %conv
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll b/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll
new file mode 100644
index 0000000..971be5c
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll
@@ -0,0 +1,214 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+define i32 @icmp_eq_imm(i32 %a) nounwind ssp {
+entry:
+; CHECK: icmp_eq_imm
+; CHECK: cmp w0, #31
+; CHECK: cset w0, eq
+ %cmp = icmp eq i32 %a, 31
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_eq_neg_imm(i32 %a) nounwind ssp {
+entry:
+; CHECK: icmp_eq_neg_imm
+; CHECK: cmn w0, #7
+; CHECK: cset w0, eq
+ %cmp = icmp eq i32 %a, -7
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_eq(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_eq
+; CHECK: cmp w0, w1
+; CHECK: cset w0, eq
+ %cmp = icmp eq i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_ne(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_ne
+; CHECK: cmp w0, w1
+; CHECK: cset w0, ne
+ %cmp = icmp ne i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_ugt(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_ugt
+; CHECK: cmp w0, w1
+; CHECK: cset w0, hi
+ %cmp = icmp ugt i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_uge(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_uge
+; CHECK: cmp w0, w1
+; CHECK: cset w0, hs
+ %cmp = icmp uge i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_ult(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_ult
+; CHECK: cmp w0, w1
+; CHECK: cset w0, lo
+ %cmp = icmp ult i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_ule(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_ule
+; CHECK: cmp w0, w1
+; CHECK: cset w0, ls
+ %cmp = icmp ule i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_sgt(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_sgt
+; CHECK: cmp w0, w1
+; CHECK: cset w0, gt
+ %cmp = icmp sgt i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_sge(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_sge
+; CHECK: cmp w0, w1
+; CHECK: cset w0, ge
+ %cmp = icmp sge i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_slt(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_slt
+; CHECK: cmp w0, w1
+; CHECK: cset w0, lt
+ %cmp = icmp slt i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_sle(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: icmp_sle
+; CHECK: cmp w0, w1
+; CHECK: cset w0, le
+ %cmp = icmp sle i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define i32 @icmp_i64(i64 %a, i64 %b) nounwind ssp {
+entry:
+; CHECK: icmp_i64
+; CHECK: cmp x0, x1
+; CHECK: cset w{{[0-9]+}}, le
+ %cmp = icmp sle i64 %a, %b
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+define zeroext i1 @icmp_eq_i16(i16 %a, i16 %b) nounwind ssp {
+entry:
+; CHECK: icmp_eq_i16
+; CHECK: sxth w0, w0
+; CHECK: sxth w1, w1
+; CHECK: cmp w0, w1
+; CHECK: cset w0, eq
+ %cmp = icmp eq i16 %a, %b
+ ret i1 %cmp
+}
+
+define zeroext i1 @icmp_eq_i8(i8 %a, i8 %b) nounwind ssp {
+entry:
+; CHECK: icmp_eq_i8
+; CHECK: sxtb w0, w0
+; CHECK: sxtb w1, w1
+; CHECK: cmp w0, w1
+; CHECK: cset w0, eq
+ %cmp = icmp eq i8 %a, %b
+ ret i1 %cmp
+}
+
+define i32 @icmp_i16_unsigned(i16 %a, i16 %b) nounwind {
+entry:
+; CHECK: icmp_i16_unsigned
+; CHECK: uxth w0, w0
+; CHECK: uxth w1, w1
+; CHECK: cmp w0, w1
+; CHECK: cset w0, lo
+ %cmp = icmp ult i16 %a, %b
+ %conv2 = zext i1 %cmp to i32
+ ret i32 %conv2
+}
+
+define i32 @icmp_i8_signed(i8 %a, i8 %b) nounwind {
+entry:
+; CHECK: @icmp_i8_signed
+; CHECK: sxtb w0, w0
+; CHECK: sxtb w1, w1
+; CHECK: cmp w0, w1
+; CHECK: cset w0, gt
+ %cmp = icmp sgt i8 %a, %b
+ %conv2 = zext i1 %cmp to i32
+ ret i32 %conv2
+}
+
+
+define i32 @icmp_i16_signed_const(i16 %a) nounwind {
+entry:
+; CHECK: icmp_i16_signed_const
+; CHECK: sxth w0, w0
+; CHECK: cmn w0, #233
+; CHECK: cset w0, lt
+; CHECK: and w0, w0, #0x1
+ %cmp = icmp slt i16 %a, -233
+ %conv2 = zext i1 %cmp to i32
+ ret i32 %conv2
+}
+
+define i32 @icmp_i8_signed_const(i8 %a) nounwind {
+entry:
+; CHECK: icmp_i8_signed_const
+; CHECK: sxtb w0, w0
+; CHECK: cmp w0, #124
+; CHECK: cset w0, gt
+; CHECK: and w0, w0, #0x1
+ %cmp = icmp sgt i8 %a, 124
+ %conv2 = zext i1 %cmp to i32
+ ret i32 %conv2
+}
+
+define i32 @icmp_i1_unsigned_const(i1 %a) nounwind {
+entry:
+; CHECK: icmp_i1_unsigned_const
+; CHECK: and w0, w0, #0x1
+; CHECK: cmp w0, #0
+; CHECK: cset w0, lo
+; CHECK: and w0, w0, #0x1
+ %cmp = icmp ult i1 %a, 0
+ %conv2 = zext i1 %cmp to i32
+ ret i32 %conv2
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll b/test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll
new file mode 100644
index 0000000..70335ac
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+@fn.table = internal global [2 x i8*] [i8* blockaddress(@fn, %ZERO), i8* blockaddress(@fn, %ONE)], align 8
+
+define i32 @fn(i32 %target) nounwind {
+entry:
+; CHECK: @fn
+ %retval = alloca i32, align 4
+ %target.addr = alloca i32, align 4
+ store i32 %target, i32* %target.addr, align 4
+ %0 = load i32* %target.addr, align 4
+ %idxprom = zext i32 %0 to i64
+ %arrayidx = getelementptr inbounds [2 x i8*]* @fn.table, i32 0, i64 %idxprom
+ %1 = load i8** %arrayidx, align 8
+ br label %indirectgoto
+
+ZERO: ; preds = %indirectgoto
+; CHECK: LBB0_1
+ store i32 0, i32* %retval
+ br label %return
+
+ONE: ; preds = %indirectgoto
+; CHECK: LBB0_2
+ store i32 1, i32* %retval
+ br label %return
+
+return: ; preds = %ONE, %ZERO
+ %2 = load i32* %retval
+ ret i32 %2
+
+indirectgoto: ; preds = %entry
+; CHECK: ldr x0, [sp]
+; CHECK: br x0
+ %indirect.goto.dest = phi i8* [ %1, %entry ]
+ indirectbr i8* %indirect.goto.dest, [label %ZERO, label %ONE]
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
new file mode 100644
index 0000000..a3d5f6c
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
@@ -0,0 +1,135 @@
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=arm64-apple-ios | FileCheck %s --check-prefix=ARM64
+
+@message = global [80 x i8] c"The LLVM Compiler Infrastructure\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", align 16
+@temp = common global [80 x i8] zeroinitializer, align 16
+
+define void @t1() {
+; ARM64-LABEL: t1
+; ARM64: adrp x8, _message@PAGE
+; ARM64: add x0, x8, _message@PAGEOFF
+; ARM64: movz w9, #0
+; ARM64: movz x2, #0x50
+; ARM64: uxtb w1, w9
+; ARM64: bl _memset
+ call void @llvm.memset.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i8 0, i64 80, i32 16, i1 false)
+ ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+
+define void @t2() {
+; ARM64-LABEL: t2
+; ARM64: adrp x8, _temp@GOTPAGE
+; ARM64: ldr x0, [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp x8, _message@PAGE
+; ARM64: add x1, x8, _message@PAGEOFF
+; ARM64: movz x2, #0x50
+; ARM64: bl _memcpy
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 80, i32 16, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1)
+
+define void @t3() {
+; ARM64-LABEL: t3
+; ARM64: adrp x8, _temp@GOTPAGE
+; ARM64: ldr x0, [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp x8, _message@PAGE
+; ARM64: add x1, x8, _message@PAGEOFF
+; ARM64: movz x2, #0x14
+; ARM64: bl _memmove
+ call void @llvm.memmove.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 20, i32 16, i1 false)
+ ret void
+}
+
+declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1)
+
+define void @t4() {
+; ARM64-LABEL: t4
+; ARM64: adrp x8, _temp@GOTPAGE
+; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp x9, _message@PAGE
+; ARM64: add x9, x9, _message@PAGEOFF
+; ARM64: ldr x10, [x9]
+; ARM64: str x10, [x8]
+; ARM64: ldr x10, [x9, #8]
+; ARM64: str x10, [x8, #8]
+; ARM64: ldrb w11, [x9, #16]
+; ARM64: strb w11, [x8, #16]
+; ARM64: ret
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 17, i32 16, i1 false)
+ ret void
+}
+
+define void @t5() {
+; ARM64-LABEL: t5
+; ARM64: adrp x8, _temp@GOTPAGE
+; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp x9, _message@PAGE
+; ARM64: add x9, x9, _message@PAGEOFF
+; ARM64: ldr x10, [x9]
+; ARM64: str x10, [x8]
+; ARM64: ldr x10, [x9, #8]
+; ARM64: str x10, [x8, #8]
+; ARM64: ldrb w11, [x9, #16]
+; ARM64: strb w11, [x8, #16]
+; ARM64: ret
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 17, i32 8, i1 false)
+ ret void
+}
+
+define void @t6() {
+; ARM64-LABEL: t6
+; ARM64: adrp x8, _temp@GOTPAGE
+; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp x9, _message@PAGE
+; ARM64: add x9, x9, _message@PAGEOFF
+; ARM64: ldr w10, [x9]
+; ARM64: str w10, [x8]
+; ARM64: ldr w10, [x9, #4]
+; ARM64: str w10, [x8, #4]
+; ARM64: ldrb w10, [x9, #8]
+; ARM64: strb w10, [x8, #8]
+; ARM64: ret
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 9, i32 4, i1 false)
+ ret void
+}
+
+define void @t7() {
+; ARM64-LABEL: t7
+; ARM64: adrp x8, _temp@GOTPAGE
+; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp x9, _message@PAGE
+; ARM64: add x9, x9, _message@PAGEOFF
+; ARM64: ldrh w10, [x9]
+; ARM64: strh w10, [x8]
+; ARM64: ldrh w10, [x9, #2]
+; ARM64: strh w10, [x8, #2]
+; ARM64: ldrh w10, [x9, #4]
+; ARM64: strh w10, [x8, #4]
+; ARM64: ldrb w10, [x9, #6]
+; ARM64: strb w10, [x8, #6]
+; ARM64: ret
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 7, i32 2, i1 false)
+ ret void
+}
+
+define void @t8() {
+; ARM64-LABEL: t8
+; ARM64: adrp x8, _temp@GOTPAGE
+; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp x9, _message@PAGE
+; ARM64: add x9, x9, _message@PAGEOFF
+; ARM64: ldrb w10, [x9]
+; ARM64: strb w10, [x8]
+; ARM64: ldrb w10, [x9, #1]
+; ARM64: strb w10, [x8, #1]
+; ARM64: ldrb w10, [x9, #2]
+; ARM64: strb w10, [x8, #2]
+; ARM64: ldrb w10, [x9, #3]
+; ARM64: strb w10, [x8, #3]
+; ARM64: ret
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([80 x i8]* @temp, i32 0, i32 0), i8* getelementptr inbounds ([80 x i8]* @message, i32 0, i32 0), i64 4, i32 1, i1 false)
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll b/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll
new file mode 100644
index 0000000..ffac131
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+; Materialize using fmov
+define void @float_(float* %value) {
+; CHECK: @float_
+; CHECK: fmov s0, #1.25000000
+ store float 1.250000e+00, float* %value, align 4
+ ret void
+}
+
+define void @double_(double* %value) {
+; CHECK: @double_
+; CHECK: fmov d0, #1.25000000
+ store double 1.250000e+00, double* %value, align 8
+ ret void
+}
+
+; Materialize from constant pool
+define float @float_cp() {
+; CHECK: @float_cp
+ ret float 0x400921FB60000000
+}
+
+define double @double_cp() {
+; CHECK: @double_cp
+ ret double 0x400921FB54442D18
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll b/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll
new file mode 100644
index 0000000..483d179
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll
@@ -0,0 +1,68 @@
+; RUN: llc -mtriple=arm64-apple-ios -O0 %s -o - | FileCheck %s
+
+; Fast-isel can't do vector conversions yet, but it was emitting some highly
+; suspect UCVTFUWDri MachineInstrs.
+define <4 x float> @test_uitofp(<4 x i32> %in) {
+; CHECK-LABEL: test_uitofp:
+; CHECK: ucvtf.4s v0, v0
+
+ %res = uitofp <4 x i32> %in to <4 x float>
+ ret <4 x float> %res
+}
+
+define <2 x double> @test_sitofp(<2 x i32> %in) {
+; CHECK-LABEL: test_sitofp:
+; CHECK: sshll.2d [[EXT:v[0-9]+]], v0, #0
+; CHECK: scvtf.2d v0, [[EXT]]
+
+ %res = sitofp <2 x i32> %in to <2 x double>
+ ret <2 x double> %res
+}
+
+define <2 x i32> @test_fptoui(<2 x float> %in) {
+; CHECK-LABEL: test_fptoui:
+; CHECK: fcvtzu.2s v0, v0
+
+ %res = fptoui <2 x float> %in to <2 x i32>
+ ret <2 x i32> %res
+}
+
+define <2 x i64> @test_fptosi(<2 x double> %in) {
+; CHECK-LABEL: test_fptosi:
+; CHECK: fcvtzs.2d v0, v0
+
+ %res = fptosi <2 x double> %in to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define fp128 @uitofp_i32_fp128(i32 %a) {
+entry:
+; CHECK-LABEL: uitofp_i32_fp128
+; CHECK: bl ___floatunsitf
+ %conv = uitofp i32 %a to fp128
+ ret fp128 %conv
+}
+
+define fp128 @uitofp_i64_fp128(i64 %a) {
+entry:
+; CHECK-LABEL: uitofp_i64_fp128
+; CHECK: bl ___floatunditf
+ %conv = uitofp i64 %a to fp128
+ ret fp128 %conv
+}
+
+define i32 @uitofp_fp128_i32(fp128 %a) {
+entry:
+; CHECK-LABEL: uitofp_fp128_i32
+; CHECK: ___fixunstfsi
+ %conv = fptoui fp128 %a to i32
+ ret i32 %conv
+}
+
+define i64 @uitofp_fp128_i64(fp128 %a) {
+entry:
+; CHECK-LABEL: uitofp_fp128_i64
+; CHECK: ___fixunstfdi
+ %conv = fptoui fp128 %a to i64
+ ret i64 %conv
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-rem.ll b/test/CodeGen/AArch64/arm64-fast-isel-rem.ll
new file mode 100644
index 0000000..d5bdbaa
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-rem.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin -print-machineinstrs=expand-isel-pseudos -o /dev/null 2> %t
+; RUN: FileCheck %s < %t --check-prefix=CHECK-SSA
+; REQUIRES: asserts
+
+; CHECK-SSA-LABEL: Machine code for function t1
+
+; CHECK-SSA: [[QUOTREG:%vreg[0-9]+]]<def> = SDIVWr
+; CHECK-SSA-NOT: [[QUOTREG]]<def> =
+; CHECK-SSA: {{%vreg[0-9]+}}<def> = MSUBWrrr [[QUOTREG]]
+
+; CHECK-SSA-LABEL: Machine code for function t2
+
+define i32 @t1(i32 %a, i32 %b) {
+; CHECK: @t1
+; CHECK: sdiv [[TMP:w[0-9]+]], w0, w1
+; CHECK: msub w0, [[TMP]], w1, w0
+ %1 = srem i32 %a, %b
+ ret i32 %1
+}
+
+define i64 @t2(i64 %a, i64 %b) {
+; CHECK: @t2
+; CHECK: sdiv [[TMP:x[0-9]+]], x0, x1
+; CHECK: msub x0, [[TMP]], x1, x0
+ %1 = srem i64 %a, %b
+ ret i64 %1
+}
+
+define i32 @t3(i32 %a, i32 %b) {
+; CHECK: @t3
+; CHECK: udiv [[TMP:w[0-9]+]], w0, w1
+; CHECK: msub w0, [[TMP]], w1, w0
+ %1 = urem i32 %a, %b
+ ret i32 %1
+}
+
+define i64 @t4(i64 %a, i64 %b) {
+; CHECK: @t4
+; CHECK: udiv [[TMP:x[0-9]+]], x0, x1
+; CHECK: msub x0, [[TMP]], x1, x0
+ %1 = urem i64 %a, %b
+ ret i64 %1
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-ret.ll b/test/CodeGen/AArch64/arm64-fast-isel-ret.ll
new file mode 100644
index 0000000..d91fd28
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-ret.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+;; Test returns.
+define void @t0() nounwind ssp {
+entry:
+; CHECK: t0
+; CHECK: ret
+ ret void
+}
+
+define i32 @t1(i32 %a) nounwind ssp {
+entry:
+; CHECK: t1
+; CHECK: str w0, [sp, #12]
+; CHECK-NEXT: ldr w0, [sp, #12]
+; CHECK: ret
+ %a.addr = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ %tmp = load i32* %a.addr, align 4
+ ret i32 %tmp
+}
+
+define i64 @t2(i64 %a) nounwind ssp {
+entry:
+; CHECK: t2
+; CHECK: str x0, [sp, #8]
+; CHECK-NEXT: ldr x0, [sp, #8]
+; CHECK: ret
+ %a.addr = alloca i64, align 8
+ store i64 %a, i64* %a.addr, align 8
+ %tmp = load i64* %a.addr, align 8
+ ret i64 %tmp
+}
+
+define signext i16 @ret_i16(i16 signext %a) nounwind {
+entry:
+; CHECK: @ret_i16
+; CHECK: sxth w0, w0
+ %a.addr = alloca i16, align 1
+ store i16 %a, i16* %a.addr, align 1
+ %0 = load i16* %a.addr, align 1
+ ret i16 %0
+}
+
+define signext i8 @ret_i8(i8 signext %a) nounwind {
+entry:
+; CHECK: @ret_i8
+; CHECK: sxtb w0, w0
+ %a.addr = alloca i8, align 1
+ store i8 %a, i8* %a.addr, align 1
+ %0 = load i8* %a.addr, align 1
+ ret i8 %0
+}
+
+define signext i1 @ret_i1(i1 signext %a) nounwind {
+entry:
+; CHECK: @ret_i1
+; CHECK: and w0, w0, #0x1
+ %a.addr = alloca i1, align 1
+ store i1 %a, i1* %a.addr, align 1
+ %0 = load i1* %a.addr, align 1
+ ret i1 %0
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-select.ll b/test/CodeGen/AArch64/arm64-fast-isel-select.ll
new file mode 100644
index 0000000..1cc207f
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-select.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+define i32 @t1(i32 %c) nounwind readnone {
+entry:
+; CHECK: @t1
+; CHECK: and w0, w0, #0x1
+; CHECK: subs w0, w0, #0
+; CHECK: csel w0, w{{[0-9]+}}, w{{[0-9]+}}, ne
+ %0 = icmp sgt i32 %c, 1
+ %1 = select i1 %0, i32 123, i32 357
+ ret i32 %1
+}
+
+define i64 @t2(i32 %c) nounwind readnone {
+entry:
+; CHECK: @t2
+; CHECK: and w0, w0, #0x1
+; CHECK: subs w0, w0, #0
+; CHECK: csel x0, x{{[0-9]+}}, x{{[0-9]+}}, ne
+ %0 = icmp sgt i32 %c, 1
+ %1 = select i1 %0, i64 123, i64 357
+ ret i64 %1
+}
+
+define i32 @t3(i1 %c, i32 %a, i32 %b) nounwind readnone {
+entry:
+; CHECK: @t3
+; CHECK: and w0, w0, #0x1
+; CHECK: subs w0, w0, #0
+; CHECK: csel w0, w{{[0-9]+}}, w{{[0-9]+}}, ne
+ %0 = select i1 %c, i32 %a, i32 %b
+ ret i32 %0
+}
+
+define i64 @t4(i1 %c, i64 %a, i64 %b) nounwind readnone {
+entry:
+; CHECK: @t4
+; CHECK: and w0, w0, #0x1
+; CHECK: subs w0, w0, #0
+; CHECK: csel x0, x{{[0-9]+}}, x{{[0-9]+}}, ne
+ %0 = select i1 %c, i64 %a, i64 %b
+ ret i64 %0
+}
+
+define float @t5(i1 %c, float %a, float %b) nounwind readnone {
+entry:
+; CHECK: @t5
+; CHECK: and w0, w0, #0x1
+; CHECK: subs w0, w0, #0
+; CHECK: fcsel s0, s0, s1, ne
+ %0 = select i1 %c, float %a, float %b
+ ret float %0
+}
+
+define double @t6(i1 %c, double %a, double %b) nounwind readnone {
+entry:
+; CHECK: @t6
+; CHECK: and w0, w0, #0x1
+; CHECK: subs w0, w0, #0
+; CHECK: fcsel d0, d0, d1, ne
+ %0 = select i1 %c, double %a, double %b
+ ret double %0
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel.ll b/test/CodeGen/AArch64/arm64-fast-isel.ll
new file mode 100644
index 0000000..0194b3a
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel.ll
@@ -0,0 +1,95 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+define void @t0(i32 %a) nounwind {
+entry:
+; CHECK: t0
+; CHECK: str {{w[0-9]+}}, [sp, #12]
+; CHECK-NEXT: ldr [[REGISTER:w[0-9]+]], [sp, #12]
+; CHECK-NEXT: str [[REGISTER]], [sp, #12]
+; CHECK: ret
+ %a.addr = alloca i32, align 4
+ store i32 %a, i32* %a.addr
+ %tmp = load i32* %a.addr
+ store i32 %tmp, i32* %a.addr
+ ret void
+}
+
+define void @t1(i64 %a) nounwind {
+; CHECK: t1
+; CHECK: str {{x[0-9]+}}, [sp, #8]
+; CHECK-NEXT: ldr [[REGISTER:x[0-9]+]], [sp, #8]
+; CHECK-NEXT: str [[REGISTER]], [sp, #8]
+; CHECK: ret
+ %a.addr = alloca i64, align 4
+ store i64 %a, i64* %a.addr
+ %tmp = load i64* %a.addr
+ store i64 %tmp, i64* %a.addr
+ ret void
+}
+
+define zeroext i1 @i1(i1 %a) nounwind {
+entry:
+; CHECK: @i1
+; CHECK: and w0, w0, #0x1
+; CHECK: strb w0, [sp, #15]
+; CHECK: ldrb w0, [sp, #15]
+; CHECK: and w0, w0, #0x1
+; CHECK: and w0, w0, #0x1
+; CHECK: add sp, sp, #16
+; CHECK: ret
+ %a.addr = alloca i1, align 1
+ store i1 %a, i1* %a.addr, align 1
+ %0 = load i1* %a.addr, align 1
+ ret i1 %0
+}
+
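+; t2-t5 check that loads and stores at small negative offsets are selected as
+; unscaled ldur/stur instructions.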
+define i32 @t2(i32 *%ptr) nounwind {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: ldur w0, [x0, #-4]
+; CHECK: ret
+ %0 = getelementptr i32 *%ptr, i32 -1
+ %1 = load i32* %0, align 4
+ ret i32 %1
+}
+
+define i32 @t3(i32 *%ptr) nounwind {
+entry:
+; CHECK-LABEL: t3:
+; CHECK: ldur w0, [x0, #-256]
+; CHECK: ret
+ %0 = getelementptr i32 *%ptr, i32 -64
+ %1 = load i32* %0, align 4
+ ret i32 %1
+}
+
+define void @t4(i32 *%ptr) nounwind {
+entry:
+; CHECK-LABEL: t4:
+; CHECK: movz w8, #0
+; CHECK: stur w8, [x0, #-4]
+; CHECK: ret
+ %0 = getelementptr i32 *%ptr, i32 -1
+ store i32 0, i32* %0, align 4
+ ret void
+}
+
+define void @t5(i32 *%ptr) nounwind {
+entry:
+; CHECK-LABEL: t5:
+; CHECK: movz w8, #0
+; CHECK: stur w8, [x0, #-256]
+; CHECK: ret
+ %0 = getelementptr i32 *%ptr, i32 -64
+ store i32 0, i32* %0, align 4
+ ret void
+}
+
+define void @t6() nounwind {
+; CHECK: t6
+; CHECK: brk #0x1
+ tail call void @llvm.trap()
+ ret void
+}
+
+declare void @llvm.trap() nounwind
diff --git a/test/CodeGen/AArch64/arm64-fastcc-tailcall.ll b/test/CodeGen/AArch64/arm64-fastcc-tailcall.ll
new file mode 100644
index 0000000..8a744c5
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fastcc-tailcall.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define void @caller(i32* nocapture %p, i32 %a, i32 %b) nounwind optsize ssp {
+; CHECK-NOT: stp
+; CHECK: b {{_callee|callee}}
+; CHECK-NOT: ldp
+; CHECK: ret
+ %1 = icmp eq i32 %b, 0
+ br i1 %1, label %3, label %2
+
+ tail call fastcc void @callee(i32* %p, i32 %a) optsize
+ br label %3
+
+ ret void
+}
+
+define internal fastcc void @callee(i32* nocapture %p, i32 %a) nounwind optsize noinline ssp {
+ store volatile i32 %a, i32* %p, align 4, !tbaa !0
+ ret void
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/AArch64/arm64-fastisel-gep-promote-before-add.ll b/test/CodeGen/AArch64/arm64-fastisel-gep-promote-before-add.ll
new file mode 100644
index 0000000..af9fe05
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fastisel-gep-promote-before-add.ll
@@ -0,0 +1,18 @@
+; fastisel should not fold add with non-pointer bitwidth
+; sext(a) + sext(b) != sext(a + b)
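+; For example, with a = b = 64 the i8 add wraps to -128, so sext(a + b) is
+; -128 while sext(a) + sext(b) is +128; folding the narrow add into the
+; promoted index would compute the wrong address.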
+; RUN: llc -mtriple=arm64-apple-darwin %s -O0 -o - | FileCheck %s
+
+define zeroext i8 @gep_promotion(i8* %ptr) nounwind uwtable ssp {
+entry:
+ %ptr.addr = alloca i8*, align 8
+ %add = add i8 64, 64 ; 0x40 + 0x40
+ %0 = load i8** %ptr.addr, align 8
+
+ ; CHECK-LABEL: _gep_promotion:
+ ; CHECK: ldrb {{[a-z][0-9]+}}, {{\[[a-z][0-9]+\]}}
+ %arrayidx = getelementptr inbounds i8* %0, i8 %add
+
+ %1 = load i8* %arrayidx, align 1
+ ret i8 %1
+}
+
diff --git a/test/CodeGen/AArch64/arm64-fcmp-opt.ll b/test/CodeGen/AArch64/arm64-fcmp-opt.ll
new file mode 100644
index 0000000..41027d4
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fcmp-opt.ll
@@ -0,0 +1,204 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone -aarch64-neon-syntax=apple | FileCheck %s
+; rdar://10263824
+
+define i1 @fcmp_float1(float %a) nounwind ssp {
+entry:
+; CHECK-LABEL: @fcmp_float1
+; CHECK: fcmp s0, #0.0
+; CHECK: cset w0, ne
+ %cmp = fcmp une float %a, 0.000000e+00
+ ret i1 %cmp
+}
+
+define i1 @fcmp_float2(float %a, float %b) nounwind ssp {
+entry:
+; CHECK-LABEL: @fcmp_float2
+; CHECK: fcmp s0, s1
+; CHECK: cset w0, ne
+ %cmp = fcmp une float %a, %b
+ ret i1 %cmp
+}
+
+define i1 @fcmp_double1(double %a) nounwind ssp {
+entry:
+; CHECK-LABEL: @fcmp_double1
+; CHECK: fcmp d0, #0.0
+; CHECK: cset w0, ne
+ %cmp = fcmp une double %a, 0.000000e+00
+ ret i1 %cmp
+}
+
+define i1 @fcmp_double2(double %a, double %b) nounwind ssp {
+entry:
+; CHECK-LABEL: @fcmp_double2
+; CHECK: fcmp d0, d1
+; CHECK: cset w0, ne
+ %cmp = fcmp une double %a, %b
+ ret i1 %cmp
+}
+
+; Check each fcmp condition
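+; The expected mapping from IEEE predicates to AArch64 condition codes below is:
+;   oeq->eq  ogt->gt  oge->ge  olt->mi  ole->ls  ord->vc
+;   uno->vs  ugt->hi  uge->pl  ult->lt  ule->le  une->ne
+; one and ueq have no single condition code and are checked as two fcsels.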
+define float @fcmp_oeq(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_oeq
+; CHECK: fcmp s0, s1
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], eq
+
+ %cmp = fcmp oeq float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ogt(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_ogt
+; CHECK: fcmp s0, s1
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], gt
+
+ %cmp = fcmp ogt float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_oge(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_oge
+; CHECK: fcmp s0, s1
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], ge
+
+ %cmp = fcmp oge float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_olt(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_olt
+; CHECK: fcmp s0, s1
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], mi
+
+ %cmp = fcmp olt float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ole(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_ole
+; CHECK: fcmp s0, s1
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], ls
+
+ %cmp = fcmp ole float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ord(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_ord
+; CHECK: fcmp s0, s1
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], vc
+ %cmp = fcmp ord float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_uno(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_uno
+; CHECK: fcmp s0, s1
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], vs
+ %cmp = fcmp uno float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ugt(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_ugt
+; CHECK: fcmp s0, s1
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], hi
+ %cmp = fcmp ugt float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_uge(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_uge
+; CHECK: fcmp s0, s1
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], pl
+ %cmp = fcmp uge float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ult(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_ult
+; CHECK: fcmp s0, s1
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], lt
+ %cmp = fcmp ult float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_ule(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_ule
+; CHECK: fcmp s0, s1
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], le
+ %cmp = fcmp ule float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+define float @fcmp_une(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_une
+; CHECK: fcmp s0, s1
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel s0, s[[ONE]], s[[ZERO]], ne
+ %cmp = fcmp une float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+; Possible opportunity for improvement. See comment in
+; ARM64TargetLowering::LowerSETCC()
+define float @fcmp_one(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_one
+; CHECK: fcmp s0, s1
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel [[TMP:s[0-9]+]], s[[ONE]], s[[ZERO]], mi
+; CHECK: fcsel s0, s[[ONE]], [[TMP]], gt
+ %cmp = fcmp one float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
+
+; Possible opportunity for improvement. See comment in
+; ARM64TargetLowering::LowerSETCC()
+define float @fcmp_ueq(float %a, float %b) nounwind ssp {
+; CHECK-LABEL: @fcmp_ueq
+; CHECK: fcmp s0, s1
+; CHECK-DAG: movi.2d v[[ZERO:[0-9]+]], #0
+; CHECK-DAG: fmov s[[ONE:[0-9]+]], #1.0
+; CHECK: fcsel [[TMP:s[0-9]+]], s[[ONE]], s[[ZERO]], eq
+; CHECK: fcsel s0, s[[ONE]], [[TMP]], vs
+ %cmp = fcmp ueq float %a, %b
+ %conv = uitofp i1 %cmp to float
+ ret float %conv
+}
diff --git a/test/CodeGen/AArch64/arm64-fcopysign.ll b/test/CodeGen/AArch64/arm64-fcopysign.ll
new file mode 100644
index 0000000..66241df
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fcopysign.ll
@@ -0,0 +1,51 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+
+; rdar://9332258
+
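+; The checks below expect copysign to be lowered as a sign-bit mask (built with
+; movi.4s #0x80, lsl #24 for float, or movi.2d #0 plus fneg.2d for double)
+; combined with bit.16b, which inserts the sign bit of the second operand.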
+define float @test1(float %x, float %y) nounwind {
+entry:
+; CHECK-LABEL: test1:
+; CHECK: movi.4s v2, #0x80, lsl #24
+; CHECK: bit.16b v0, v1, v2
+ %0 = tail call float @copysignf(float %x, float %y) nounwind readnone
+ ret float %0
+}
+
+define double @test2(double %x, double %y) nounwind {
+entry:
+; CHECK-LABEL: test2:
+; CHECK: movi.2d v2, #0
+; CHECK: fneg.2d v2, v2
+; CHECK: bit.16b v0, v1, v2
+ %0 = tail call double @copysign(double %x, double %y) nounwind readnone
+ ret double %0
+}
+
+; rdar://9545768
+define double @test3(double %a, float %b, float %c) nounwind {
+; CHECK-LABEL: test3:
+; CHECK: fcvt d1, s1
+; CHECK: fneg.2d v2, v{{[0-9]+}}
+; CHECK: bit.16b v0, v1, v2
+ %tmp1 = fadd float %b, %c
+ %tmp2 = fpext float %tmp1 to double
+ %tmp = tail call double @copysign( double %a, double %tmp2 ) nounwind readnone
+ ret double %tmp
+}
+
+define float @test4() nounwind {
+entry:
+; CHECK-LABEL: test4:
+; CHECK: fcvt s0, d0
+; CHECK: movi.4s v[[CONST:[0-9]+]], #0x80, lsl #24
+; CHECK: bit.16b v{{[0-9]+}}, v0, v[[CONST]]
+ %0 = tail call double (...)* @bar() nounwind
+ %1 = fptrunc double %0 to float
+ %2 = tail call float @copysignf(float 5.000000e-01, float %1) nounwind readnone
+ %3 = fadd float %1, %2
+ ret float %3
+}
+
+declare double @bar(...)
+declare double @copysign(double, double) nounwind readnone
+declare float @copysignf(float, float) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-fixed-point-scalar-cvt-dagcombine.ll b/test/CodeGen/AArch64/arm64-fixed-point-scalar-cvt-dagcombine.ll
new file mode 100644
index 0000000..e51c38b
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fixed-point-scalar-cvt-dagcombine.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+; DAGCombine to transform a conversion of an extract_vector_elt to an
+; extract_vector_elt of a conversion, which saves a round trip of copies
+; of the value to a GPR and back to an FPR.
+; rdar://11855286
+define double @foo0(<2 x i64> %a) nounwind {
+; CHECK: scvtf.2d [[REG:v[0-9]+]], v0, #9
+; CHECK-NEXT: ins.d v0[0], [[REG]][1]
+ %vecext = extractelement <2 x i64> %a, i32 1
+ %fcvt_n = tail call double @llvm.aarch64.neon.vcvtfxs2fp.f64.i64(i64 %vecext, i32 9)
+ ret double %fcvt_n
+}
+
+declare double @llvm.aarch64.neon.vcvtfxs2fp.f64.i64(i64, i32) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-fmadd.ll b/test/CodeGen/AArch64/arm64-fmadd.ll
new file mode 100644
index 0000000..c791900
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fmadd.ll
@@ -0,0 +1,92 @@
+; RUN: llc -march=arm64 < %s | FileCheck %s
+
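+; The fneg of an operand or of the fma result is expressed below as an fmul by
+; -1.0; the checks expect it to be folded into fmsub, fnmadd and fnmsub.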
+define float @fma32(float %a, float %b, float %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: fma32:
+; CHECK: fmadd s0, s0, s1, s2
+ %0 = tail call float @llvm.fma.f32(float %a, float %b, float %c)
+ ret float %0
+}
+
+define float @fnma32(float %a, float %b, float %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: fnma32:
+; CHECK: fnmadd s0, s0, s1, s2
+ %0 = tail call float @llvm.fma.f32(float %a, float %b, float %c)
+ %mul = fmul float %0, -1.000000e+00
+ ret float %mul
+}
+
+define float @fms32(float %a, float %b, float %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: fms32:
+; CHECK: fmsub s0, s0, s1, s2
+ %mul = fmul float %b, -1.000000e+00
+ %0 = tail call float @llvm.fma.f32(float %a, float %mul, float %c)
+ ret float %0
+}
+
+define float @fms32_com(float %a, float %b, float %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: fms32_com:
+; CHECK: fmsub s0, s1, s0, s2
+ %mul = fmul float %b, -1.000000e+00
+ %0 = tail call float @llvm.fma.f32(float %mul, float %a, float %c)
+ ret float %0
+}
+
+define float @fnms32(float %a, float %b, float %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: fnms32:
+; CHECK: fnmsub s0, s0, s1, s2
+ %mul = fmul float %c, -1.000000e+00
+ %0 = tail call float @llvm.fma.f32(float %a, float %b, float %mul)
+ ret float %0
+}
+
+define double @fma64(double %a, double %b, double %c) nounwind readnone ssp {
+; CHECK-LABEL: fma64:
+; CHECK: fmadd d0, d0, d1, d2
+entry:
+ %0 = tail call double @llvm.fma.f64(double %a, double %b, double %c)
+ ret double %0
+}
+
+define double @fnma64(double %a, double %b, double %c) nounwind readnone ssp {
+; CHECK-LABEL: fnma64:
+; CHECK: fnmadd d0, d0, d1, d2
+entry:
+ %0 = tail call double @llvm.fma.f64(double %a, double %b, double %c)
+ %mul = fmul double %0, -1.000000e+00
+ ret double %mul
+}
+
+define double @fms64(double %a, double %b, double %c) nounwind readnone ssp {
+; CHECK-LABEL: fms64:
+; CHECK: fmsub d0, d0, d1, d2
+entry:
+ %mul = fmul double %b, -1.000000e+00
+ %0 = tail call double @llvm.fma.f64(double %a, double %mul, double %c)
+ ret double %0
+}
+
+define double @fms64_com(double %a, double %b, double %c) nounwind readnone ssp {
+; CHECK-LABEL: fms64_com:
+; CHECK: fmsub d0, d1, d0, d2
+entry:
+ %mul = fmul double %b, -1.000000e+00
+ %0 = tail call double @llvm.fma.f64(double %mul, double %a, double %c)
+ ret double %0
+}
+
+define double @fnms64(double %a, double %b, double %c) nounwind readnone ssp {
+; CHECK-LABEL: fnms64:
+; CHECK: fnmsub d0, d0, d1, d2
+entry:
+ %mul = fmul double %c, -1.000000e+00
+ %0 = tail call double @llvm.fma.f64(double %a, double %b, double %mul)
+ ret double %0
+}
+
+declare float @llvm.fma.f32(float, float, float) nounwind readnone
+declare double @llvm.fma.f64(double, double, double) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-fmax.ll b/test/CodeGen/AArch64/arm64-fmax.ll
new file mode 100644
index 0000000..94b7454
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fmax.ll
@@ -0,0 +1,34 @@
+; RUN: llc -march=arm64 -enable-no-nans-fp-math < %s | FileCheck %s
+
+define double @test_direct(float %in) #1 {
+; CHECK-LABEL: test_direct:
+ %cmp = fcmp olt float %in, 0.000000e+00
+ %longer = fpext float %in to double
+ %val = select i1 %cmp, double 0.000000e+00, double %longer
+ ret double %val
+
+; CHECK: fmax
+}
+
+define double @test_cross(float %in) #1 {
+; CHECK-LABEL: test_cross:
+ %cmp = fcmp olt float %in, 0.000000e+00
+ %longer = fpext float %in to double
+ %val = select i1 %cmp, double %longer, double 0.000000e+00
+ ret double %val
+
+; CHECK: fmin
+}
+
+; This isn't a min or a max, but passes the first condition for swapping the
+; results. Make sure they're put back before we resort to the normal fcsel.
+define float @test_cross_fail(float %lhs, float %rhs) {
+; CHECK-LABEL: test_cross_fail:
+ %tst = fcmp une float %lhs, %rhs
+ %res = select i1 %tst, float %rhs, float %lhs
+ ret float %res
+
+ ; The register allocator would have to decide to be deliberately obtuse before
+ ; other registers were used.
+; CHECK: fcsel s0, s1, s0, ne
+}
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/arm64-fminv.ll b/test/CodeGen/AArch64/arm64-fminv.ll
new file mode 100644
index 0000000..f4c9735
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fminv.ll
@@ -0,0 +1,101 @@
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+
+define float @test_fminv_v2f32(<2 x float> %in) {
+; CHECK: test_fminv_v2f32:
+; CHECK: fminp s0, v0.2s
+ %min = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> %in)
+ ret float %min
+}
+
+define float @test_fminv_v4f32(<4 x float> %in) {
+; CHECK: test_fminv_v4f32:
+; CHECK: fminv s0, v0.4s
+ %min = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> %in)
+ ret float %min
+}
+
+define double @test_fminv_v2f64(<2 x double> %in) {
+; CHECK: test_fminv_v2f64:
+; CHECK: fminp d0, v0.2d
+ %min = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> %in)
+ ret double %min
+}
+
+declare float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float>)
+declare double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double>)
+
+define float @test_fmaxv_v2f32(<2 x float> %in) {
+; CHECK: test_fmaxv_v2f32:
+; CHECK: fmaxp s0, v0.2s
+ %max = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %in)
+ ret float %max
+}
+
+define float @test_fmaxv_v4f32(<4 x float> %in) {
+; CHECK: test_fmaxv_v4f32:
+; CHECK: fmaxv s0, v0.4s
+ %max = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> %in)
+ ret float %max
+}
+
+define double @test_fmaxv_v2f64(<2 x double> %in) {
+; CHECK: test_fmaxv_v2f64:
+; CHECK: fmaxp d0, v0.2d
+ %max = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> %in)
+ ret double %max
+}
+
+declare float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float>)
+declare double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double>)
+
+define float @test_fminnmv_v2f32(<2 x float> %in) {
+; CHECK: test_fminnmv_v2f32:
+; CHECK: fminnmp s0, v0.2s
+ %minnm = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> %in)
+ ret float %minnm
+}
+
+define float @test_fminnmv_v4f32(<4 x float> %in) {
+; CHECK: test_fminnmv_v4f32:
+; CHECK: fminnmv s0, v0.4s
+ %minnm = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> %in)
+ ret float %minnm
+}
+
+define double @test_fminnmv_v2f64(<2 x double> %in) {
+; CHECK: test_fminnmv_v2f64:
+; CHECK: fminnmp d0, v0.2d
+ %minnm = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> %in)
+ ret double %minnm
+}
+
+declare float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float>)
+declare double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double>)
+
+define float @test_fmaxnmv_v2f32(<2 x float> %in) {
+; CHECK: test_fmaxnmv_v2f32:
+; CHECK: fmaxnmp s0, v0.2s
+ %maxnm = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> %in)
+ ret float %maxnm
+}
+
+define float @test_fmaxnmv_v4f32(<4 x float> %in) {
+; CHECK: test_fmaxnmv_v4f32:
+; CHECK: fmaxnmv s0, v0.4s
+ %maxnm = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> %in)
+ ret float %maxnm
+}
+
+define double @test_fmaxnmv_v2f64(<2 x double> %in) {
+; CHECK: test_fmaxnmv_v2f64:
+; CHECK: fmaxnmp d0, v0.2d
+ %maxnm = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> %in)
+ ret double %maxnm
+}
+
+declare float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float>)
+declare double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double>)
diff --git a/test/CodeGen/AArch64/arm64-fmuladd.ll b/test/CodeGen/AArch64/arm64-fmuladd.ll
new file mode 100644
index 0000000..6c5eeca
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fmuladd.ll
@@ -0,0 +1,88 @@
+; RUN: llc -asm-verbose=false < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define float @test_f32(float* %A, float* %B, float* %C) nounwind {
+;CHECK-LABEL: test_f32:
+;CHECK: fmadd
+;CHECK-NOT: fmadd
+ %tmp1 = load float* %A
+ %tmp2 = load float* %B
+ %tmp3 = load float* %C
+ %tmp4 = call float @llvm.fmuladd.f32(float %tmp1, float %tmp2, float %tmp3)
+ ret float %tmp4
+}
+
+define <2 x float> @test_v2f32(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
+;CHECK-LABEL: test_v2f32:
+;CHECK: fmla.2s
+;CHECK-NOT: fmla.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = load <2 x float>* %C
+ %tmp4 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2, <2 x float> %tmp3)
+ ret <2 x float> %tmp4
+}
+
+define <4 x float> @test_v4f32(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
+;CHECK-LABEL: test_v4f32:
+;CHECK: fmla.4s
+;CHECK-NOT: fmla.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = load <4 x float>* %C
+ %tmp4 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2, <4 x float> %tmp3)
+ ret <4 x float> %tmp4
+}
+
+define <8 x float> @test_v8f32(<8 x float>* %A, <8 x float>* %B, <8 x float>* %C) nounwind {
+;CHECK-LABEL: test_v8f32:
+;CHECK: fmla.4s
+;CHECK: fmla.4s
+;CHECK-NOT: fmla.4s
+ %tmp1 = load <8 x float>* %A
+ %tmp2 = load <8 x float>* %B
+ %tmp3 = load <8 x float>* %C
+ %tmp4 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %tmp1, <8 x float> %tmp2, <8 x float> %tmp3)
+ ret <8 x float> %tmp4
+}
+
+define double @test_f64(double* %A, double* %B, double* %C) nounwind {
+;CHECK-LABEL: test_f64:
+;CHECK: fmadd
+;CHECK-NOT: fmadd
+ %tmp1 = load double* %A
+ %tmp2 = load double* %B
+ %tmp3 = load double* %C
+ %tmp4 = call double @llvm.fmuladd.f64(double %tmp1, double %tmp2, double %tmp3)
+ ret double %tmp4
+}
+
+define <2 x double> @test_v2f64(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
+;CHECK-LABEL: test_v2f64:
+;CHECK: fmla.2d
+;CHECK-NOT: fmla.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = load <2 x double>* %C
+ %tmp4 = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2, <2 x double> %tmp3)
+ ret <2 x double> %tmp4
+}
+
+define <4 x double> @test_v4f64(<4 x double>* %A, <4 x double>* %B, <4 x double>* %C) nounwind {
+;CHECK-LABEL: test_v4f64:
+;CHECK: fmla.2d
+;CHECK: fmla.2d
+;CHECK-NOT: fmla.2d
+ %tmp1 = load <4 x double>* %A
+ %tmp2 = load <4 x double>* %B
+ %tmp3 = load <4 x double>* %C
+ %tmp4 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %tmp1, <4 x double> %tmp2, <4 x double> %tmp3)
+ ret <4 x double> %tmp4
+}
+
+declare float @llvm.fmuladd.f32(float, float, float) nounwind readnone
+declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+declare <8 x float> @llvm.fmuladd.v8f32(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+declare double @llvm.fmuladd.f64(double, double, double) nounwind readnone
+declare <2 x double> @llvm.fmuladd.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-fold-address.ll b/test/CodeGen/AArch64/arm64-fold-address.ll
new file mode 100644
index 0000000..96cc3e9
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fold-address.ll
@@ -0,0 +1,79 @@
+; RUN: llc < %s -O2 -mtriple=arm64-apple-darwin | FileCheck %s
+
+%0 = type opaque
+%struct.CGRect = type { %struct.CGPoint, %struct.CGSize }
+%struct.CGPoint = type { double, double }
+%struct.CGSize = type { double, double }
+
+@"OBJC_IVAR_$_UIScreen._bounds" = external hidden global i64, section "__DATA, __objc_ivar", align 8
+
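+; In @nofold the four loads are at different offsets from the ivar base, so a
+; single add plus immediate-offset ldp pairs are expected; in @fold every load
+; uses the same index, which should fold into the ldr register-offset form.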
+define hidden %struct.CGRect @nofold(%0* nocapture %self, i8* nocapture %_cmd) nounwind readonly optsize ssp {
+entry:
+; CHECK-LABEL: nofold:
+; CHECK: add x[[REG:[0-9]+]], x0, x{{[0-9]+}}
+; CHECK: ldp d0, d1, [x[[REG]]]
+; CHECK: ldp d2, d3, [x[[REG]], #16]
+; CHECK: ret
+ %ivar = load i64* @"OBJC_IVAR_$_UIScreen._bounds", align 8, !invariant.load !4
+ %0 = bitcast %0* %self to i8*
+ %add.ptr = getelementptr inbounds i8* %0, i64 %ivar
+ %add.ptr10.0 = bitcast i8* %add.ptr to double*
+ %tmp11 = load double* %add.ptr10.0, align 8
+ %add.ptr.sum = add i64 %ivar, 8
+ %add.ptr10.1 = getelementptr inbounds i8* %0, i64 %add.ptr.sum
+ %1 = bitcast i8* %add.ptr10.1 to double*
+ %tmp12 = load double* %1, align 8
+ %add.ptr.sum17 = add i64 %ivar, 16
+ %add.ptr4.1 = getelementptr inbounds i8* %0, i64 %add.ptr.sum17
+ %add.ptr4.1.0 = bitcast i8* %add.ptr4.1 to double*
+ %tmp = load double* %add.ptr4.1.0, align 8
+ %add.ptr4.1.sum = add i64 %ivar, 24
+ %add.ptr4.1.1 = getelementptr inbounds i8* %0, i64 %add.ptr4.1.sum
+ %2 = bitcast i8* %add.ptr4.1.1 to double*
+ %tmp5 = load double* %2, align 8
+ %insert14 = insertvalue %struct.CGPoint undef, double %tmp11, 0
+ %insert16 = insertvalue %struct.CGPoint %insert14, double %tmp12, 1
+ %insert = insertvalue %struct.CGRect undef, %struct.CGPoint %insert16, 0
+ %insert7 = insertvalue %struct.CGSize undef, double %tmp, 0
+ %insert9 = insertvalue %struct.CGSize %insert7, double %tmp5, 1
+ %insert3 = insertvalue %struct.CGRect %insert, %struct.CGSize %insert9, 1
+ ret %struct.CGRect %insert3
+}
+
+define hidden %struct.CGRect @fold(%0* nocapture %self, i8* nocapture %_cmd) nounwind readonly optsize ssp {
+entry:
+; CHECK-LABEL: fold:
+; CHECK: ldr d0, [x0, x{{[0-9]+}}]
+; CHECK-NOT: add x0, x0, x1
+; CHECK: ret
+ %ivar = load i64* @"OBJC_IVAR_$_UIScreen._bounds", align 8, !invariant.load !4
+ %0 = bitcast %0* %self to i8*
+ %add.ptr = getelementptr inbounds i8* %0, i64 %ivar
+ %add.ptr10.0 = bitcast i8* %add.ptr to double*
+ %tmp11 = load double* %add.ptr10.0, align 8
+ %add.ptr10.1 = getelementptr inbounds i8* %0, i64 %ivar
+ %1 = bitcast i8* %add.ptr10.1 to double*
+ %tmp12 = load double* %1, align 8
+ %add.ptr4.1 = getelementptr inbounds i8* %0, i64 %ivar
+ %add.ptr4.1.0 = bitcast i8* %add.ptr4.1 to double*
+ %tmp = load double* %add.ptr4.1.0, align 8
+ %add.ptr4.1.1 = getelementptr inbounds i8* %0, i64 %ivar
+ %2 = bitcast i8* %add.ptr4.1.1 to double*
+ %tmp5 = load double* %2, align 8
+ %insert14 = insertvalue %struct.CGPoint undef, double %tmp11, 0
+ %insert16 = insertvalue %struct.CGPoint %insert14, double %tmp12, 1
+ %insert = insertvalue %struct.CGRect undef, %struct.CGPoint %insert16, 0
+ %insert7 = insertvalue %struct.CGSize undef, double %tmp, 0
+ %insert9 = insertvalue %struct.CGSize %insert7, double %tmp5, 1
+ %insert3 = insertvalue %struct.CGRect %insert, %struct.CGSize %insert9, 1
+ ret %struct.CGRect %insert3
+}
+
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2}
+!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0}
+!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0}
+!4 = metadata !{}
diff --git a/test/CodeGen/AArch64/arm64-fold-lsl.ll b/test/CodeGen/AArch64/arm64-fold-lsl.ll
new file mode 100644
index 0000000..ec65e46
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fold-lsl.ll
@@ -0,0 +1,79 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+;
+; <rdar://problem/14486451>
+
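+; The (lshr #9) and (and #255) index computation should combine into a single
+; ubfx, and the element-size scaling should fold into the load/store addressing
+; mode as a register offset with lsl #1/#2/#3.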
+%struct.a = type [256 x i16]
+%struct.b = type [256 x i32]
+%struct.c = type [256 x i64]
+
+define i16 @load_halfword(%struct.a* %ctx, i32 %xor72) nounwind {
+; CHECK-LABEL: load_halfword:
+; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
+; CHECK: ldrh w0, [x0, [[REG]], lsl #1]
+ %shr81 = lshr i32 %xor72, 9
+ %conv82 = zext i32 %shr81 to i64
+ %idxprom83 = and i64 %conv82, 255
+ %arrayidx86 = getelementptr inbounds %struct.a* %ctx, i64 0, i64 %idxprom83
+ %result = load i16* %arrayidx86, align 2
+ ret i16 %result
+}
+
+define i32 @load_word(%struct.b* %ctx, i32 %xor72) nounwind {
+; CHECK-LABEL: load_word:
+; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
+; CHECK: ldr w0, [x0, [[REG]], lsl #2]
+ %shr81 = lshr i32 %xor72, 9
+ %conv82 = zext i32 %shr81 to i64
+ %idxprom83 = and i64 %conv82, 255
+ %arrayidx86 = getelementptr inbounds %struct.b* %ctx, i64 0, i64 %idxprom83
+ %result = load i32* %arrayidx86, align 4
+ ret i32 %result
+}
+
+define i64 @load_doubleword(%struct.c* %ctx, i32 %xor72) nounwind {
+; CHECK-LABEL: load_doubleword:
+; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
+; CHECK: ldr x0, [x0, [[REG]], lsl #3]
+ %shr81 = lshr i32 %xor72, 9
+ %conv82 = zext i32 %shr81 to i64
+ %idxprom83 = and i64 %conv82, 255
+ %arrayidx86 = getelementptr inbounds %struct.c* %ctx, i64 0, i64 %idxprom83
+ %result = load i64* %arrayidx86, align 8
+ ret i64 %result
+}
+
+define void @store_halfword(%struct.a* %ctx, i32 %xor72, i16 %val) nounwind {
+; CHECK-LABEL: store_halfword:
+; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
+; CHECK: strh w2, [x0, [[REG]], lsl #1]
+ %shr81 = lshr i32 %xor72, 9
+ %conv82 = zext i32 %shr81 to i64
+ %idxprom83 = and i64 %conv82, 255
+ %arrayidx86 = getelementptr inbounds %struct.a* %ctx, i64 0, i64 %idxprom83
+ store i16 %val, i16* %arrayidx86, align 8
+ ret void
+}
+
+define void @store_word(%struct.b* %ctx, i32 %xor72, i32 %val) nounwind {
+; CHECK-LABEL: store_word:
+; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
+; CHECK: str w2, [x0, [[REG]], lsl #2]
+ %shr81 = lshr i32 %xor72, 9
+ %conv82 = zext i32 %shr81 to i64
+ %idxprom83 = and i64 %conv82, 255
+ %arrayidx86 = getelementptr inbounds %struct.b* %ctx, i64 0, i64 %idxprom83
+ store i32 %val, i32* %arrayidx86, align 8
+ ret void
+}
+
+define void @store_doubleword(%struct.c* %ctx, i32 %xor72, i64 %val) nounwind {
+; CHECK-LABEL: store_doubleword:
+; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
+; CHECK: str x2, [x0, [[REG]], lsl #3]
+ %shr81 = lshr i32 %xor72, 9
+ %conv82 = zext i32 %shr81 to i64
+ %idxprom83 = and i64 %conv82, 255
+ %arrayidx86 = getelementptr inbounds %struct.c* %ctx, i64 0, i64 %idxprom83
+ store i64 %val, i64* %arrayidx86, align 8
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-fp-contract-zero.ll b/test/CodeGen/AArch64/arm64-fp-contract-zero.ll
new file mode 100644
index 0000000..f982cbb
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fp-contract-zero.ll
@@ -0,0 +1,14 @@
+; RUN: llc -mtriple=arm64 -fp-contract=fast -o - %s | FileCheck %s
+
+
+; Make sure we don't try to fold an fneg into +0.0, creating an illegal constant
+; -0.0. It's also good, though not essential, that we don't resort to a litpool.
+define double @test_fms_fold(double %a, double %b) {
+; CHECK-LABEL: test_fms_fold:
+; CHECK: fmov {{d[0-9]+}}, xzr
+; CHECK: ret
+ %mul = fmul double %a, 0.000000e+00
+ %mul1 = fmul double %b, 0.000000e+00
+ %sub = fsub double %mul, %mul1
+ ret double %sub
+}
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/arm64-fp-imm.ll b/test/CodeGen/AArch64/arm64-fp-imm.ll
new file mode 100644
index 0000000..6e271e0
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fp-imm.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+
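+; None of these constants are encodable as an fmov immediate, so they are
+; expected to be loaded from the literal pool via adrp + ldr.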
+; CHECK: literal8
+; CHECK: .quad 4614256656552045848
+define double @foo() {
+; CHECK: _foo:
+; CHECK: adrp x[[REG:[0-9]+]], lCPI0_0@PAGE
+; CHECK: ldr d0, [x[[REG]], lCPI0_0@PAGEOFF]
+; CHECK-NEXT: ret
+ ret double 0x400921FB54442D18
+}
+
+; CHECK: literal4
+; CHECK: .long 1078530011
+define float @bar() {
+; CHECK: _bar:
+; CHECK: adrp x[[REG:[0-9]+]], lCPI1_0@PAGE
+; CHECK: ldr s0, [x[[REG]], lCPI1_0@PAGEOFF]
+; CHECK-NEXT: ret
+ ret float 0x400921FB60000000
+}
+
+; CHECK: literal16
+; CHECK: .quad 0
+; CHECK: .quad 0
+define fp128 @baz() {
+; CHECK: _baz:
+; CHECK: adrp x[[REG:[0-9]+]], lCPI2_0@PAGE
+; CHECK: ldr q0, [x[[REG]], lCPI2_0@PAGEOFF]
+; CHECK-NEXT: ret
+ ret fp128 0xL00000000000000000000000000000000
+}
diff --git a/test/CodeGen/AArch64/arm64-fp.ll b/test/CodeGen/AArch64/arm64-fp.ll
new file mode 100644
index 0000000..08b1b67
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fp.ll
@@ -0,0 +1,8 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define float @t1(i1 %a, float %b, float %c) nounwind {
+; CHECK: t1
+; CHECK: fcsel s0, s0, s1, ne
+ %sel = select i1 %a, float %b, float %c
+ ret float %sel
+}
diff --git a/test/CodeGen/AArch64/arm64-fp128-folding.ll b/test/CodeGen/AArch64/arm64-fp128-folding.ll
new file mode 100644
index 0000000..6a7d203
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fp128-folding.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=arm64 -verify-machineinstrs < %s | FileCheck %s
+declare void @bar(i8*, i8*, i32*)
+
+; SelectionDAG used to try to fold some fp128 operations using the ppc128 type,
+; which is not supported.
+
+define fp128 @test_folding() {
+; CHECK-LABEL: test_folding:
+ %l = alloca i32
+ store i32 42, i32* %l
+ %val = load i32* %l
+ %fpval = sitofp i32 %val to fp128
+ ; If the value is loaded from a constant pool into an fp128, it's been folded
+ ; successfully.
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}},
+ ret fp128 %fpval
+}
diff --git a/test/CodeGen/AArch64/fp128.ll b/test/CodeGen/AArch64/arm64-fp128.ll
index c312bb1..57bbb93 100644
--- a/test/CodeGen/AArch64/fp128.ll
+++ b/test/CodeGen/AArch64/arm64-fp128.ll
@@ -1,15 +1,15 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone < %s | FileCheck %s
-@lhs = global fp128 zeroinitializer
-@rhs = global fp128 zeroinitializer
+@lhs = global fp128 zeroinitializer, align 16
+@rhs = global fp128 zeroinitializer, align 16
define fp128 @test_add() {
; CHECK-LABEL: test_add:
- %lhs = load fp128* @lhs
- %rhs = load fp128* @rhs
-; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:lhs]
-; CHECK: ldr q1, [{{x[0-9]+}}, #:lo12:rhs]
+ %lhs = load fp128* @lhs, align 16
+ %rhs = load fp128* @rhs, align 16
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
+; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
%val = fadd fp128 %lhs, %rhs
; CHECK: bl __addtf3
@@ -19,10 +19,10 @@ define fp128 @test_add() {
define fp128 @test_sub() {
; CHECK-LABEL: test_sub:
- %lhs = load fp128* @lhs
- %rhs = load fp128* @rhs
-; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:lhs]
-; CHECK: ldr q1, [{{x[0-9]+}}, #:lo12:rhs]
+ %lhs = load fp128* @lhs, align 16
+ %rhs = load fp128* @rhs, align 16
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
+; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
%val = fsub fp128 %lhs, %rhs
; CHECK: bl __subtf3
@@ -32,10 +32,10 @@ define fp128 @test_sub() {
define fp128 @test_mul() {
; CHECK-LABEL: test_mul:
- %lhs = load fp128* @lhs
- %rhs = load fp128* @rhs
-; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:lhs]
-; CHECK: ldr q1, [{{x[0-9]+}}, #:lo12:rhs]
+ %lhs = load fp128* @lhs, align 16
+ %rhs = load fp128* @rhs, align 16
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
+; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
%val = fmul fp128 %lhs, %rhs
; CHECK: bl __multf3
@@ -45,10 +45,10 @@ define fp128 @test_mul() {
define fp128 @test_div() {
; CHECK-LABEL: test_div:
- %lhs = load fp128* @lhs
- %rhs = load fp128* @rhs
-; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:lhs]
-; CHECK: ldr q1, [{{x[0-9]+}}, #:lo12:rhs]
+ %lhs = load fp128* @lhs, align 16
+ %rhs = load fp128* @rhs, align 16
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
+; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
%val = fdiv fp128 %lhs, %rhs
; CHECK: bl __divtf3
@@ -60,7 +60,7 @@ define fp128 @test_div() {
define void @test_fptosi() {
; CHECK-LABEL: test_fptosi:
- %val = load fp128* @lhs
+ %val = load fp128* @lhs, align 16
%val32 = fptosi fp128 %val to i32
store i32 %val32, i32* @var32
@@ -75,7 +75,7 @@ define void @test_fptosi() {
define void @test_fptoui() {
; CHECK-LABEL: test_fptoui:
- %val = load fp128* @lhs
+ %val = load fp128* @lhs, align 16
%val32 = fptoui fp128 %val to i32
store i32 %val32, i32* @var32
@@ -123,17 +123,17 @@ define void @test_uitofp() {
define i1 @test_setcc1() {
; CHECK-LABEL: test_setcc1:
- %lhs = load fp128* @lhs
- %rhs = load fp128* @rhs
-; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:lhs]
-; CHECK: ldr q1, [{{x[0-9]+}}, #:lo12:rhs]
+ %lhs = load fp128* @lhs, align 16
+ %rhs = load fp128* @rhs, align 16
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
+; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
; Technically, everything after the call to __letf2 is redundant, but we'll let
; LLVM have its fun for now.
%val = fcmp ole fp128 %lhs, %rhs
; CHECK: bl __letf2
; CHECK: cmp w0, #0
-; CHECK: csinc w0, wzr, wzr, gt
+; CHECK: cset w0, le
ret i1 %val
; CHECK: ret
@@ -142,22 +142,19 @@ define i1 @test_setcc1() {
define i1 @test_setcc2() {
; CHECK-LABEL: test_setcc2:
- %lhs = load fp128* @lhs
- %rhs = load fp128* @rhs
-; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:lhs]
-; CHECK: ldr q1, [{{x[0-9]+}}, #:lo12:rhs]
+ %lhs = load fp128* @lhs, align 16
+ %rhs = load fp128* @rhs, align 16
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
+; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
-; Technically, everything after the call to __letf2 is redundant, but we'll let
-; LLVM have its fun for now.
%val = fcmp ugt fp128 %lhs, %rhs
; CHECK: bl __gttf2
-; CHECK: cmp w0, #0
-; CHECK: csinc [[GT:w[0-9]+]], wzr, wzr, le
+; CHECK: cmp w0, #0
+; CHECK: cset [[GT:w[0-9]+]], gt
; CHECK: bl __unordtf2
-; CHECK: cmp w0, #0
-; CHECK: csinc [[UNORDERED:w[0-9]+]], wzr, wzr, eq
-
+; CHECK: cmp w0, #0
+; CHECK: cset [[UNORDERED:w[0-9]+]], ne
; CHECK: orr w0, [[UNORDERED]], [[GT]]
ret i1 %val
@@ -167,20 +164,20 @@ define i1 @test_setcc2() {
define i32 @test_br_cc() {
; CHECK-LABEL: test_br_cc:
- %lhs = load fp128* @lhs
- %rhs = load fp128* @rhs
-; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:lhs]
-; CHECK: ldr q1, [{{x[0-9]+}}, #:lo12:rhs]
+ %lhs = load fp128* @lhs, align 16
+ %rhs = load fp128* @rhs, align 16
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
+; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
; olt == !uge, which is what LLVM unfortunately "optimizes" this to.
%cond = fcmp olt fp128 %lhs, %rhs
; CHECK: bl __getf2
-; CHECK: cmp w0, #0
-; CHECK: csinc [[OGE:w[0-9]+]], wzr, wzr, lt
+; CHECK: cmp w0, #0
+; CHECK: cset [[OGE:w[0-9]+]], ge
; CHECK: bl __unordtf2
-; CHECK: cmp w0, #0
-; CHECK: csinc [[UNORDERED:w[0-9]+]], wzr, wzr, eq
+; CHECK: cmp w0, #0
+; CHECK: cset [[UNORDERED:w[0-9]+]], ne
; CHECK: orr [[UGE:w[0-9]+]], [[UNORDERED]], [[OGE]]
; CHECK: cbnz [[UGE]], [[RET29:.LBB[0-9]+_[0-9]+]]
@@ -189,13 +186,13 @@ define i32 @test_br_cc() {
iftrue:
ret i32 42
; CHECK-NEXT: BB#
-; CHECK-NEXT: movz x0, #42
+; CHECK-NEXT: movz w0, #0x2a
; CHECK-NEXT: b [[REALRET:.LBB[0-9]+_[0-9]+]]
iffalse:
ret i32 29
; CHECK: [[RET29]]:
-; CHECK-NEXT: movz x0, #29
+; CHECK-NEXT: movz w0, #0x1d
; CHECK-NEXT: [[REALRET]]:
; CHECK: ret
}
@@ -204,36 +201,34 @@ define void @test_select(i1 %cond, fp128 %lhs, fp128 %rhs) {
; CHECK-LABEL: test_select:
%val = select i1 %cond, fp128 %lhs, fp128 %rhs
- store fp128 %val, fp128* @lhs
-; CHECK: cmp w0, #0
-; CHECK: str q1, [sp]
+ store fp128 %val, fp128* @lhs, align 16
+; CHECK: tst w0, #0x1
; CHECK-NEXT: b.eq [[IFFALSE:.LBB[0-9]+_[0-9]+]]
; CHECK-NEXT: BB#
-; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: mov v[[VAL:[0-9]+]].16b, v0.16b
; CHECK-NEXT: [[IFFALSE]]:
-; CHECK-NEXT: ldr q0, [sp]
-; CHECK: str q0, [{{x[0-9]+}}, #:lo12:lhs]
+; CHECK: str q[[VAL]], [{{x[0-9]+}}, :lo12:lhs]
ret void
; CHECK: ret
}
-@varfloat = global float 0.0
-@vardouble = global double 0.0
+@varfloat = global float 0.0, align 4
+@vardouble = global double 0.0, align 8
define void @test_round() {
; CHECK-LABEL: test_round:
- %val = load fp128* @lhs
+ %val = load fp128* @lhs, align 16
%float = fptrunc fp128 %val to float
- store float %float, float* @varfloat
+ store float %float, float* @varfloat, align 4
; CHECK: bl __trunctfsf2
-; CHECK: str s0, [{{x[0-9]+}}, #:lo12:varfloat]
+; CHECK: str s0, [{{x[0-9]+}}, :lo12:varfloat]
%double = fptrunc fp128 %val to double
- store double %double, double* @vardouble
+ store double %double, double* @vardouble, align 8
; CHECK: bl __trunctfdf2
-; CHECK: str d0, [{{x[0-9]+}}, #:lo12:vardouble]
+; CHECK: str d0, [{{x[0-9]+}}, :lo12:vardouble]
ret void
}
@@ -241,19 +236,19 @@ define void @test_round() {
define void @test_extend() {
; CHECK-LABEL: test_extend:
- %val = load fp128* @lhs
+ %val = load fp128* @lhs, align 16
%float = load float* @varfloat
%fromfloat = fpext float %float to fp128
- store volatile fp128 %fromfloat, fp128* @lhs
+ store volatile fp128 %fromfloat, fp128* @lhs, align 16
; CHECK: bl __extendsftf2
-; CHECK: str q0, [{{x[0-9]+}}, #:lo12:lhs]
+; CHECK: str q0, [{{x[0-9]+}}, :lo12:lhs]
%double = load double* @vardouble
%fromdouble = fpext double %double to fp128
- store volatile fp128 %fromdouble, fp128* @lhs
+ store volatile fp128 %fromdouble, fp128* @lhs, align 16
; CHECK: bl __extenddftf2
-; CHECK: str q0, [{{x[0-9]+}}, #:lo12:lhs]
+; CHECK: str q0, [{{x[0-9]+}}, :lo12:lhs]
ret void
; CHECK: ret
@@ -269,9 +264,8 @@ define fp128 @test_neg(fp128 %in) {
; Could in principle be optimized to fneg, which we can't select; this makes
; sure that doesn't happen.
%ret = fsub fp128 0xL00000000000000008000000000000000, %in
-; CHECK: str q0, [sp, #-16]
-; CHECK-NEXT: ldr q1, [sp], #16
-; CHECK: ldr q0, [{{x[0-9]+}}, #:lo12:[[MINUS0]]]
+; CHECK: mov v1.16b, v0.16b
+; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:[[MINUS0]]]
; CHECK: bl __subtf3
ret fp128 %ret
diff --git a/test/CodeGen/AArch64/arm64-frame-index.ll b/test/CodeGen/AArch64/arm64-frame-index.ll
new file mode 100644
index 0000000..4a91ff3
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-frame-index.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=arm64 -mtriple=arm64-apple-ios < %s | FileCheck %s
+; rdar://11935841
+
+define void @t1() nounwind ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK-NOT: add x{{[0-9]+}}, sp
+; CHECK: stp x28, x27, [sp, #-16]!
+ %v = alloca [288 x i32], align 4
+ unreachable
+}
diff --git a/test/CodeGen/AArch64/arm64-frameaddr.ll b/test/CodeGen/AArch64/arm64-frameaddr.ll
new file mode 100644
index 0000000..469078c
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-frameaddr.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define i8* @t() nounwind {
+entry:
+; CHECK-LABEL: t:
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
+; CHECK: mov x0, x29
+; CHECK: ldp x29, x30, [sp], #16
+; CHECK: ret
+ %0 = call i8* @llvm.frameaddress(i32 0)
+ ret i8* %0
+}
+
+declare i8* @llvm.frameaddress(i32) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-global-address.ll b/test/CodeGen/AArch64/arm64-global-address.ll
new file mode 100644
index 0000000..005f414
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-global-address.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+; rdar://9618644
+
+@G = external global i32
+
+define i32 @test(i32 %off) nounwind {
+; CHECK-LABEL: test:
+; CHECK: adrp x[[REG:[0-9]+]], _G@GOTPAGE
+; CHECK: ldr x[[REG2:[0-9]+]], [x[[REG]], _G@GOTPAGEOFF]
+; CHECK: add w0, w[[REG2]], w0
+ %tmp = ptrtoint i32* @G to i32
+ %tmp1 = add i32 %tmp, %off
+ ret i32 %tmp1
+}
diff --git a/test/CodeGen/AArch64/arm64-hello.ll b/test/CodeGen/AArch64/arm64-hello.ll
new file mode 100644
index 0000000..a6346fb
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-hello.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefix=CHECK-LINUX
+
+; CHECK-LABEL: main:
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK: adrp x0, L_.str@PAGE
+; CHECK: add x0, x0, L_.str@PAGEOFF
+; CHECK-NEXT: bl _puts
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+
+; CHECK-LINUX-LABEL: main:
+; CHECK-LINUX: stp x29, x30, [sp, #-16]!
+; CHECK-LINUX-NEXT: mov x29, sp
+; CHECK-LINUX-NEXT: sub sp, sp, #16
+; CHECK-LINUX-NEXT: stur wzr, [x29, #-4]
+; CHECK-LINUX: adrp x0, .L.str
+; CHECK-LINUX: add x0, x0, :lo12:.L.str
+; CHECK-LINUX-NEXT: bl puts
+; CHECK-LINUX-NEXT: mov sp, x29
+; CHECK-LINUX-NEXT: ldp x29, x30, [sp], #16
+; CHECK-LINUX-NEXT: ret
+
+@.str = private unnamed_addr constant [7 x i8] c"hello\0A\00"
+
+define i32 @main() nounwind ssp {
+entry:
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval
+ %call = call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @.str, i32 0, i32 0))
+ ret i32 %call
+}
+
+declare i32 @puts(i8*)
diff --git a/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll b/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll
new file mode 100644
index 0000000..ba759e3
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define i32 @foo(<4 x i16>* %__a) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: umov.h w{{[0-9]+}}, v{{[0-9]+}}[0]
+ %tmp18 = load <4 x i16>* %__a, align 8
+ %vget_lane = extractelement <4 x i16> %tmp18, i32 0
+ %conv = zext i16 %vget_lane to i32
+ %mul = mul nsw i32 3, %conv
+ ret i32 %mul
+}
+
diff --git a/test/CodeGen/AArch64/arm64-icmp-opt.ll b/test/CodeGen/AArch64/arm64-icmp-opt.ll
new file mode 100644
index 0000000..7b12ed7
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-icmp-opt.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+; Optimize (x > -1) to (x >= 0) etc.
+; Optimize (cmp (add / sub), 0): eliminate the subs used to update the flags
+; for the comparison only
+; rdar://10233472
+
+define i32 @t1(i64 %a) nounwind ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK-NOT: movn
+; CHECK: cmp x0, #0
+; CHECK: cset w0, ge
+ %cmp = icmp sgt i64 %a, -1
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
diff --git a/test/CodeGen/AArch64/arm64-illegal-float-ops.ll b/test/CodeGen/AArch64/arm64-illegal-float-ops.ll
new file mode 100644
index 0000000..9a35fe5
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-illegal-float-ops.ll
@@ -0,0 +1,295 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+
+@varfloat = global float 0.0
+@vardouble = global double 0.0
+@varfp128 = global fp128 zeroinitializer
+
+declare float @llvm.cos.f32(float)
+declare double @llvm.cos.f64(double)
+declare fp128 @llvm.cos.f128(fp128)
+
+define void @test_cos(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_cos:
+
+ %cosfloat = call float @llvm.cos.f32(float %float)
+ store float %cosfloat, float* @varfloat
+; CHECK: bl cosf
+
+ %cosdouble = call double @llvm.cos.f64(double %double)
+ store double %cosdouble, double* @vardouble
+; CHECK: bl cos
+
+ %cosfp128 = call fp128 @llvm.cos.f128(fp128 %fp128)
+ store fp128 %cosfp128, fp128* @varfp128
+; CHECK: bl cosl
+
+ ret void
+}
+
+declare float @llvm.exp.f32(float)
+declare double @llvm.exp.f64(double)
+declare fp128 @llvm.exp.f128(fp128)
+
+define void @test_exp(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_exp:
+
+ %expfloat = call float @llvm.exp.f32(float %float)
+ store float %expfloat, float* @varfloat
+; CHECK: bl expf
+
+ %expdouble = call double @llvm.exp.f64(double %double)
+ store double %expdouble, double* @vardouble
+; CHECK: bl exp
+
+ %expfp128 = call fp128 @llvm.exp.f128(fp128 %fp128)
+ store fp128 %expfp128, fp128* @varfp128
+; CHECK: bl expl
+
+ ret void
+}
+
+declare float @llvm.exp2.f32(float)
+declare double @llvm.exp2.f64(double)
+declare fp128 @llvm.exp2.f128(fp128)
+
+define void @test_exp2(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_exp2:
+
+ %exp2float = call float @llvm.exp2.f32(float %float)
+ store float %exp2float, float* @varfloat
+; CHECK: bl exp2f
+
+ %exp2double = call double @llvm.exp2.f64(double %double)
+ store double %exp2double, double* @vardouble
+; CHECK: bl exp2
+
+ %exp2fp128 = call fp128 @llvm.exp2.f128(fp128 %fp128)
+ store fp128 %exp2fp128, fp128* @varfp128
+; CHECK: bl exp2l
+ ret void
+
+}
+
+declare float @llvm.log.f32(float)
+declare double @llvm.log.f64(double)
+declare fp128 @llvm.log.f128(fp128)
+
+define void @test_log(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_log:
+
+ %logfloat = call float @llvm.log.f32(float %float)
+ store float %logfloat, float* @varfloat
+; CHECK: bl logf
+
+ %logdouble = call double @llvm.log.f64(double %double)
+ store double %logdouble, double* @vardouble
+; CHECK: bl log
+
+ %logfp128 = call fp128 @llvm.log.f128(fp128 %fp128)
+ store fp128 %logfp128, fp128* @varfp128
+; CHECK: bl logl
+
+ ret void
+}
+
+declare float @llvm.log2.f32(float)
+declare double @llvm.log2.f64(double)
+declare fp128 @llvm.log2.f128(fp128)
+
+define void @test_log2(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_log2:
+
+ %log2float = call float @llvm.log2.f32(float %float)
+ store float %log2float, float* @varfloat
+; CHECK: bl log2f
+
+ %log2double = call double @llvm.log2.f64(double %double)
+ store double %log2double, double* @vardouble
+; CHECK: bl log2
+
+ %log2fp128 = call fp128 @llvm.log2.f128(fp128 %fp128)
+ store fp128 %log2fp128, fp128* @varfp128
+; CHECK: bl log2l
+ ret void
+
+}
+
+declare float @llvm.log10.f32(float)
+declare double @llvm.log10.f64(double)
+declare fp128 @llvm.log10.f128(fp128)
+
+define void @test_log10(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_log10:
+
+ %log10float = call float @llvm.log10.f32(float %float)
+ store float %log10float, float* @varfloat
+; CHECK: bl log10f
+
+ %log10double = call double @llvm.log10.f64(double %double)
+ store double %log10double, double* @vardouble
+; CHECK: bl log10
+
+ %log10fp128 = call fp128 @llvm.log10.f128(fp128 %fp128)
+ store fp128 %log10fp128, fp128* @varfp128
+; CHECK: bl log10l
+
+ ret void
+}
+
+declare float @llvm.sin.f32(float)
+declare double @llvm.sin.f64(double)
+declare fp128 @llvm.sin.f128(fp128)
+
+define void @test_sin(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_sin:
+
+ %sinfloat = call float @llvm.sin.f32(float %float)
+ store float %sinfloat, float* @varfloat
+; CHECK: bl sinf
+
+ %sindouble = call double @llvm.sin.f64(double %double)
+ store double %sindouble, double* @vardouble
+; CHECK: bl sin
+
+ %sinfp128 = call fp128 @llvm.sin.f128(fp128 %fp128)
+ store fp128 %sinfp128, fp128* @varfp128
+; CHECK: bl sinl
+ ret void
+
+}
+
+declare float @llvm.pow.f32(float, float)
+declare double @llvm.pow.f64(double, double)
+declare fp128 @llvm.pow.f128(fp128, fp128)
+
+define void @test_pow(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_pow:
+
+ %powfloat = call float @llvm.pow.f32(float %float, float %float)
+ store float %powfloat, float* @varfloat
+; CHECK: bl powf
+
+ %powdouble = call double @llvm.pow.f64(double %double, double %double)
+ store double %powdouble, double* @vardouble
+; CHECK: bl pow
+
+ %powfp128 = call fp128 @llvm.pow.f128(fp128 %fp128, fp128 %fp128)
+ store fp128 %powfp128, fp128* @varfp128
+; CHECK: bl powl
+
+ ret void
+}
+
+declare float @llvm.powi.f32(float, i32)
+declare double @llvm.powi.f64(double, i32)
+declare fp128 @llvm.powi.f128(fp128, i32)
+
+define void @test_powi(float %float, double %double, i32 %exponent, fp128 %fp128) {
+; CHECK-LABEL: test_powi:
+
+ %powifloat = call float @llvm.powi.f32(float %float, i32 %exponent)
+ store float %powifloat, float* @varfloat
+; CHECK: bl __powisf2
+
+ %powidouble = call double @llvm.powi.f64(double %double, i32 %exponent)
+ store double %powidouble, double* @vardouble
+; CHECK: bl __powidf2
+
+ %powifp128 = call fp128 @llvm.powi.f128(fp128 %fp128, i32 %exponent)
+ store fp128 %powifp128, fp128* @varfp128
+; CHECK: bl __powitf2
+ ret void
+
+}
+
+define void @test_frem(float %float, double %double, fp128 %fp128) {
+; CHECK-LABEL: test_frem:
+
+ %fremfloat = frem float %float, %float
+ store float %fremfloat, float* @varfloat
+; CHECK: bl fmodf
+
+ %fremdouble = frem double %double, %double
+ store double %fremdouble, double* @vardouble
+; CHECK: bl fmod
+
+ %fremfp128 = frem fp128 %fp128, %fp128
+ store fp128 %fremfp128, fp128* @varfp128
+; CHECK: bl fmodl
+
+ ret void
+}
+
+declare fp128 @llvm.fma.f128(fp128, fp128, fp128)
+
+define void @test_fma(fp128 %fp128) {
+; CHECK-LABEL: test_fma:
+
+ %fmafp128 = call fp128 @llvm.fma.f128(fp128 %fp128, fp128 %fp128, fp128 %fp128)
+ store fp128 %fmafp128, fp128* @varfp128
+; CHECK: bl fmal
+
+ ret void
+}
+
+declare fp128 @llvm.fmuladd.f128(fp128, fp128, fp128)
+
+define void @test_fmuladd(fp128 %fp128) {
+; CHECK-LABEL: test_fmuladd:
+
+ %fmuladdfp128 = call fp128 @llvm.fmuladd.f128(fp128 %fp128, fp128 %fp128, fp128 %fp128)
+ store fp128 %fmuladdfp128, fp128* @varfp128
+; CHECK-NOT: bl fmal
+; CHECK: bl __multf3
+; CHECK: bl __addtf3
+
+ ret void
+}
+
+define i32 @test_fptosi32(fp128 %a) {
+; CHECK-LABEL: test_fptosi32:
+; CHECK: bl __fixtfsi
+ %conv.i = fptosi fp128 %a to i32
+ %b = add nsw i32 %conv.i, 48
+ ret i32 %b
+}
+
+define i64 @test_fptosi64(fp128 %a) {
+; CHECK-LABEL: test_fptosi64:
+; CHECK: bl __fixtfdi
+ %conv.i = fptosi fp128 %a to i64
+ %b = add nsw i64 %conv.i, 48
+ ret i64 %b
+}
+
+define i128 @test_fptosi128(fp128 %a) {
+; CHECK-LABEL: test_fptosi128:
+; CHECK: bl __fixtfti
+ %conv.i = fptosi fp128 %a to i128
+ %b = add nsw i128 %conv.i, 48
+ ret i128 %b
+}
+
+define i32 @test_fptoui32(fp128 %a) {
+; CHECK-LABEL: test_fptoui32:
+; CHECK: bl __fixunstfsi
+ %conv.i = fptoui fp128 %a to i32
+ %b = add nsw i32 %conv.i, 48
+ ret i32 %b
+}
+
+define i64 @test_fptoui64(fp128 %a) {
+; CHECK-LABEL: test_fptoui64:
+; CHECK: bl __fixunstfdi
+ %conv.i = fptoui fp128 %a to i64
+ %b = add nsw i64 %conv.i, 48
+ ret i64 %b
+}
+
+define i128 @test_fptoui128(fp128 %a) {
+; CHECK-LABEL: test_fptoui128:
+; CHECK: bl __fixunstfti
+ %conv.i = fptoui fp128 %a to i128
+ %b = add nsw i128 %conv.i, 48
+ ret i128 %b
+}
diff --git a/test/CodeGen/AArch64/arm64-indexed-memory.ll b/test/CodeGen/AArch64/arm64-indexed-memory.ll
new file mode 100644
index 0000000..e501c6e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-indexed-memory.ll
@@ -0,0 +1,351 @@
+; RUN: llc < %s -march=arm64 -aarch64-redzone | FileCheck %s
+
+define void @store64(i64** nocapture %out, i64 %index, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: store64:
+; CHECK: str x{{[0-9+]}}, [x{{[0-9+]}}], #8
+; CHECK: ret
+ %tmp = load i64** %out, align 8
+ %incdec.ptr = getelementptr inbounds i64* %tmp, i64 1
+ store i64 %spacing, i64* %tmp, align 4
+ store i64* %incdec.ptr, i64** %out, align 8
+ ret void
+}
+
+define void @store32(i32** nocapture %out, i32 %index, i32 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: store32:
+; CHECK: str w{{[0-9+]}}, [x{{[0-9+]}}], #4
+; CHECK: ret
+ %tmp = load i32** %out, align 8
+ %incdec.ptr = getelementptr inbounds i32* %tmp, i64 1
+ store i32 %spacing, i32* %tmp, align 4
+ store i32* %incdec.ptr, i32** %out, align 8
+ ret void
+}
+
+define void @store16(i16** nocapture %out, i16 %index, i16 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: store16:
+; CHECK: strh w{{[0-9+]}}, [x{{[0-9+]}}], #2
+; CHECK: ret
+ %tmp = load i16** %out, align 8
+ %incdec.ptr = getelementptr inbounds i16* %tmp, i64 1
+ store i16 %spacing, i16* %tmp, align 4
+ store i16* %incdec.ptr, i16** %out, align 8
+ ret void
+}
+
+define void @store8(i8** nocapture %out, i8 %index, i8 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: store8:
+; CHECK: strb w{{[0-9+]}}, [x{{[0-9+]}}], #1
+; CHECK: ret
+ %tmp = load i8** %out, align 8
+ %incdec.ptr = getelementptr inbounds i8* %tmp, i64 1
+ store i8 %spacing, i8* %tmp, align 4
+ store i8* %incdec.ptr, i8** %out, align 8
+ ret void
+}
+
+define void @truncst64to32(i32** nocapture %out, i32 %index, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: truncst64to32:
+; CHECK: str w{{[0-9+]}}, [x{{[0-9+]}}], #4
+; CHECK: ret
+ %tmp = load i32** %out, align 8
+ %incdec.ptr = getelementptr inbounds i32* %tmp, i64 1
+ %trunc = trunc i64 %spacing to i32
+ store i32 %trunc, i32* %tmp, align 4
+ store i32* %incdec.ptr, i32** %out, align 8
+ ret void
+}
+
+define void @truncst64to16(i16** nocapture %out, i16 %index, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: truncst64to16:
+; CHECK: strh w{{[0-9+]}}, [x{{[0-9+]}}], #2
+; CHECK: ret
+ %tmp = load i16** %out, align 8
+ %incdec.ptr = getelementptr inbounds i16* %tmp, i64 1
+ %trunc = trunc i64 %spacing to i16
+ store i16 %trunc, i16* %tmp, align 4
+ store i16* %incdec.ptr, i16** %out, align 8
+ ret void
+}
+
+define void @truncst64to8(i8** nocapture %out, i8 %index, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: truncst64to8:
+; CHECK: strb w{{[0-9+]}}, [x{{[0-9+]}}], #1
+; CHECK: ret
+ %tmp = load i8** %out, align 8
+ %incdec.ptr = getelementptr inbounds i8* %tmp, i64 1
+ %trunc = trunc i64 %spacing to i8
+ store i8 %trunc, i8* %tmp, align 4
+ store i8* %incdec.ptr, i8** %out, align 8
+ ret void
+}
+
+
+define void @storef32(float** nocapture %out, float %index, float %spacing) nounwind noinline ssp {
+; CHECK-LABEL: storef32:
+; CHECK: str s{{[0-9]+}}, [x{{[0-9]+}}], #4
+; CHECK: ret
+ %tmp = load float** %out, align 8
+ %incdec.ptr = getelementptr inbounds float* %tmp, i64 1
+ store float %spacing, float* %tmp, align 4
+ store float* %incdec.ptr, float** %out, align 8
+ ret void
+}
+
+define void @storef64(double** nocapture %out, double %index, double %spacing) nounwind noinline ssp {
+; CHECK-LABEL: storef64:
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+}}], #8
+; CHECK: ret
+ %tmp = load double** %out, align 8
+ %incdec.ptr = getelementptr inbounds double* %tmp, i64 1
+ store double %spacing, double* %tmp, align 4
+ store double* %incdec.ptr, double** %out, align 8
+ ret void
+}
+
+define double * @pref64(double** nocapture %out, double %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pref64:
+; CHECK: ldr x0, [x0]
+; CHECK-NEXT: str d0, [x0, #32]!
+; CHECK-NEXT: ret
+ %tmp = load double** %out, align 8
+ %ptr = getelementptr inbounds double* %tmp, i64 4
+ store double %spacing, double* %ptr, align 4
+ ret double *%ptr
+}
+
+define float * @pref32(float** nocapture %out, float %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pref32:
+; CHECK: ldr x0, [x0]
+; CHECK-NEXT: str s0, [x0, #12]!
+; CHECK-NEXT: ret
+ %tmp = load float** %out, align 8
+ %ptr = getelementptr inbounds float* %tmp, i64 3
+ store float %spacing, float* %ptr, align 4
+ ret float *%ptr
+}
+
+define i64 * @pre64(i64** nocapture %out, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pre64:
+; CHECK: ldr x0, [x0]
+; CHECK-NEXT: str x1, [x0, #16]!
+; CHECK-NEXT: ret
+ %tmp = load i64** %out, align 8
+ %ptr = getelementptr inbounds i64* %tmp, i64 2
+ store i64 %spacing, i64* %ptr, align 4
+ ret i64 *%ptr
+}
+
+define i32 * @pre32(i32** nocapture %out, i32 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pre32:
+; CHECK: ldr x0, [x0]
+; CHECK-NEXT: str w1, [x0, #8]!
+; CHECK-NEXT: ret
+ %tmp = load i32** %out, align 8
+ %ptr = getelementptr inbounds i32* %tmp, i64 2
+ store i32 %spacing, i32* %ptr, align 4
+ ret i32 *%ptr
+}
+
+define i16 * @pre16(i16** nocapture %out, i16 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pre16:
+; CHECK: ldr x0, [x0]
+; CHECK-NEXT: strh w1, [x0, #4]!
+; CHECK-NEXT: ret
+ %tmp = load i16** %out, align 8
+ %ptr = getelementptr inbounds i16* %tmp, i64 2
+ store i16 %spacing, i16* %ptr, align 4
+ ret i16 *%ptr
+}
+
+define i8 * @pre8(i8** nocapture %out, i8 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pre8:
+; CHECK: ldr x0, [x0]
+; CHECK-NEXT: strb w1, [x0, #2]!
+; CHECK-NEXT: ret
+ %tmp = load i8** %out, align 8
+ %ptr = getelementptr inbounds i8* %tmp, i64 2
+ store i8 %spacing, i8* %ptr, align 4
+ ret i8 *%ptr
+}
+
+define i32 * @pretrunc64to32(i32** nocapture %out, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pretrunc64to32:
+; CHECK: ldr x0, [x0]
+; CHECK-NEXT: str w1, [x0, #8]!
+; CHECK-NEXT: ret
+ %tmp = load i32** %out, align 8
+ %ptr = getelementptr inbounds i32* %tmp, i64 2
+ %trunc = trunc i64 %spacing to i32
+ store i32 %trunc, i32* %ptr, align 4
+ ret i32 *%ptr
+}
+
+define i16 * @pretrunc64to16(i16** nocapture %out, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pretrunc64to16:
+; CHECK: ldr x0, [x0]
+; CHECK-NEXT: strh w1, [x0, #4]!
+; CHECK-NEXT: ret
+ %tmp = load i16** %out, align 8
+ %ptr = getelementptr inbounds i16* %tmp, i64 2
+ %trunc = trunc i64 %spacing to i16
+ store i16 %trunc, i16* %ptr, align 4
+ ret i16 *%ptr
+}
+
+define i8 * @pretrunc64to8(i8** nocapture %out, i64 %spacing) nounwind noinline ssp {
+; CHECK-LABEL: pretrunc64to8:
+; CHECK: ldr x0, [x0]
+; CHECK-NEXT: strb w1, [x0, #2]!
+; CHECK-NEXT: ret
+ %tmp = load i8** %out, align 8
+ %ptr = getelementptr inbounds i8* %tmp, i64 2
+ %trunc = trunc i64 %spacing to i8
+ store i8 %trunc, i8* %ptr, align 4
+ ret i8 *%ptr
+}
+
+;-----
+; Pre-indexed loads
+;-----
+define double* @preidxf64(double* %src, double* %out) {
+; CHECK-LABEL: preidxf64:
+; CHECK: ldr d0, [x0, #8]!
+; CHECK: str d0, [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds double* %src, i64 1
+ %tmp = load double* %ptr, align 4
+ store double %tmp, double* %out, align 4
+ ret double* %ptr
+}
+
+define float* @preidxf32(float* %src, float* %out) {
+; CHECK-LABEL: preidxf32:
+; CHECK: ldr s0, [x0, #4]!
+; CHECK: str s0, [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds float* %src, i64 1
+ %tmp = load float* %ptr, align 4
+ store float %tmp, float* %out, align 4
+ ret float* %ptr
+}
+
+define i64* @preidx64(i64* %src, i64* %out) {
+; CHECK-LABEL: preidx64:
+; CHECK: ldr x[[REG:[0-9]+]], [x0, #8]!
+; CHECK: str x[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i64* %src, i64 1
+ %tmp = load i64* %ptr, align 4
+ store i64 %tmp, i64* %out, align 4
+ ret i64* %ptr
+}
+
+define i32* @preidx32(i32* %src, i32* %out) {
+; CHECK-LABEL: preidx32:
+; CHECK: ldr w[[REG:[0-9]+]], [x0, #4]!
+; CHECK: str w[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i32* %src, i64 1
+ %tmp = load i32* %ptr, align 4
+ store i32 %tmp, i32* %out, align 4
+ ret i32* %ptr
+}
+
+define i16* @preidx16zext32(i16* %src, i32* %out) {
+; CHECK-LABEL: preidx16zext32:
+; CHECK: ldrh w[[REG:[0-9]+]], [x0, #2]!
+; CHECK: str w[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i16* %src, i64 1
+ %tmp = load i16* %ptr, align 4
+ %ext = zext i16 %tmp to i32
+ store i32 %ext, i32* %out, align 4
+ ret i16* %ptr
+}
+
+define i16* @preidx16zext64(i16* %src, i64* %out) {
+; CHECK-LABEL: preidx16zext64:
+; CHECK: ldrh w[[REG:[0-9]+]], [x0, #2]!
+; CHECK: str x[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i16* %src, i64 1
+ %tmp = load i16* %ptr, align 4
+ %ext = zext i16 %tmp to i64
+ store i64 %ext, i64* %out, align 4
+ ret i16* %ptr
+}
+
+define i8* @preidx8zext32(i8* %src, i32* %out) {
+; CHECK-LABEL: preidx8zext32:
+; CHECK: ldrb w[[REG:[0-9]+]], [x0, #1]!
+; CHECK: str w[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i8* %src, i64 1
+ %tmp = load i8* %ptr, align 4
+ %ext = zext i8 %tmp to i32
+ store i32 %ext, i32* %out, align 4
+ ret i8* %ptr
+}
+
+define i8* @preidx8zext64(i8* %src, i64* %out) {
+; CHECK-LABEL: preidx8zext64:
+; CHECK: ldrb w[[REG:[0-9]+]], [x0, #1]!
+; CHECK: str x[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i8* %src, i64 1
+ %tmp = load i8* %ptr, align 4
+ %ext = zext i8 %tmp to i64
+ store i64 %ext, i64* %out, align 4
+ ret i8* %ptr
+}
+
+define i32* @preidx32sext64(i32* %src, i64* %out) {
+; CHECK-LABEL: preidx32sext64:
+; CHECK: ldrsw x[[REG:[0-9]+]], [x0, #4]!
+; CHECK: str x[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i32* %src, i64 1
+ %tmp = load i32* %ptr, align 4
+ %ext = sext i32 %tmp to i64
+ store i64 %ext, i64* %out, align 8
+ ret i32* %ptr
+}
+
+define i16* @preidx16sext32(i16* %src, i32* %out) {
+; CHECK-LABEL: preidx16sext32:
+; CHECK: ldrsh w[[REG:[0-9]+]], [x0, #2]!
+; CHECK: str w[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i16* %src, i64 1
+ %tmp = load i16* %ptr, align 4
+ %ext = sext i16 %tmp to i32
+ store i32 %ext, i32* %out, align 4
+ ret i16* %ptr
+}
+
+define i16* @preidx16sext64(i16* %src, i64* %out) {
+; CHECK-LABEL: preidx16sext64:
+; CHECK: ldrsh x[[REG:[0-9]+]], [x0, #2]!
+; CHECK: str x[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i16* %src, i64 1
+ %tmp = load i16* %ptr, align 4
+ %ext = sext i16 %tmp to i64
+ store i64 %ext, i64* %out, align 4
+ ret i16* %ptr
+}
+
+define i8* @preidx8sext32(i8* %src, i32* %out) {
+; CHECK-LABEL: preidx8sext32:
+; CHECK: ldrsb w[[REG:[0-9]+]], [x0, #1]!
+; CHECK: str w[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i8* %src, i64 1
+ %tmp = load i8* %ptr, align 4
+ %ext = sext i8 %tmp to i32
+ store i32 %ext, i32* %out, align 4
+ ret i8* %ptr
+}
+
+define i8* @preidx8sext64(i8* %src, i64* %out) {
+; CHECK-LABEL: preidx8sext64:
+; CHECK: ldrsb x[[REG:[0-9]+]], [x0, #1]!
+; CHECK: str x[[REG]], [x1]
+; CHECK: ret
+ %ptr = getelementptr inbounds i8* %src, i64 1
+ %tmp = load i8* %ptr, align 4
+ %ext = sext i8 %tmp to i64
+ store i64 %ext, i64* %out, align 4
+ ret i8* %ptr
+}
diff --git a/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll b/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll
new file mode 100644
index 0000000..c118f10
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s
+
+; This used to assert with "Overran sorted position" in AssignTopologicalOrder
+; due to a cycle created in performPostLD1Combine.
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios7.0.0"
+
+; Function Attrs: nounwind ssp
+define void @f(double* %P1) #0 {
+entry:
+ %arrayidx4 = getelementptr inbounds double* %P1, i64 1
+ %0 = load double* %arrayidx4, align 8, !tbaa !1
+ %1 = load double* %P1, align 8, !tbaa !1
+ %2 = insertelement <2 x double> undef, double %0, i32 0
+ %3 = insertelement <2 x double> %2, double %1, i32 1
+ %4 = fsub <2 x double> zeroinitializer, %3
+ %5 = fmul <2 x double> undef, %4
+ %6 = extractelement <2 x double> %5, i32 0
+ %cmp168 = fcmp olt double %6, undef
+ br i1 %cmp168, label %if.then172, label %return
+
+if.then172: ; preds = %cond.end90
+ %7 = tail call i64 @llvm.objectsize.i64.p0i8(i8* undef, i1 false)
+ br label %return
+
+return: ; preds = %if.then172, %cond.end90, %entry
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.objectsize.i64.p0i8(i8*, i1) #1
+
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"double", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
new file mode 100644
index 0000000..9ee4063
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -0,0 +1,6174 @@
+; RUN: llc -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s
+
+@ptr = global i8* null
+
+define <8 x i8> @test_v8i8_pre_load(<8 x i8>* %addr) {
+; CHECK-LABEL: test_v8i8_pre_load:
+; CHECK: ldr d0, [x0, #40]!
+ %newaddr = getelementptr <8 x i8>* %addr, i32 5
+ %val = load <8 x i8>* %newaddr, align 8
+ store <8 x i8>* %newaddr, <8 x i8>** bitcast(i8** @ptr to <8 x i8>**)
+ ret <8 x i8> %val
+}
+
+define <8 x i8> @test_v8i8_post_load(<8 x i8>* %addr) {
+; CHECK-LABEL: test_v8i8_post_load:
+; CHECK: ldr d0, [x0], #40
+ %newaddr = getelementptr <8 x i8>* %addr, i32 5
+ %val = load <8 x i8>* %addr, align 8
+ store <8 x i8>* %newaddr, <8 x i8>** bitcast(i8** @ptr to <8 x i8>**)
+ ret <8 x i8> %val
+}
+
+define void @test_v8i8_pre_store(<8 x i8> %in, <8 x i8>* %addr) {
+; CHECK-LABEL: test_v8i8_pre_store:
+; CHECK: str d0, [x0, #40]!
+ %newaddr = getelementptr <8 x i8>* %addr, i32 5
+ store <8 x i8> %in, <8 x i8>* %newaddr, align 8
+ store <8 x i8>* %newaddr, <8 x i8>** bitcast(i8** @ptr to <8 x i8>**)
+ ret void
+}
+
+define void @test_v8i8_post_store(<8 x i8> %in, <8 x i8>* %addr) {
+; CHECK-LABEL: test_v8i8_post_store:
+; CHECK: str d0, [x0], #40
+ %newaddr = getelementptr <8 x i8>* %addr, i32 5
+ store <8 x i8> %in, <8 x i8>* %addr, align 8
+ store <8 x i8>* %newaddr, <8 x i8>** bitcast(i8** @ptr to <8 x i8>**)
+ ret void
+}
+
+define <4 x i16> @test_v4i16_pre_load(<4 x i16>* %addr) {
+; CHECK-LABEL: test_v4i16_pre_load:
+; CHECK: ldr d0, [x0, #40]!
+ %newaddr = getelementptr <4 x i16>* %addr, i32 5
+ %val = load <4 x i16>* %newaddr, align 8
+ store <4 x i16>* %newaddr, <4 x i16>** bitcast(i8** @ptr to <4 x i16>**)
+ ret <4 x i16> %val
+}
+
+define <4 x i16> @test_v4i16_post_load(<4 x i16>* %addr) {
+; CHECK-LABEL: test_v4i16_post_load:
+; CHECK: ldr d0, [x0], #40
+ %newaddr = getelementptr <4 x i16>* %addr, i32 5
+ %val = load <4 x i16>* %addr, align 8
+ store <4 x i16>* %newaddr, <4 x i16>** bitcast(i8** @ptr to <4 x i16>**)
+ ret <4 x i16> %val
+}
+
+define void @test_v4i16_pre_store(<4 x i16> %in, <4 x i16>* %addr) {
+; CHECK-LABEL: test_v4i16_pre_store:
+; CHECK: str d0, [x0, #40]!
+ %newaddr = getelementptr <4 x i16>* %addr, i32 5
+ store <4 x i16> %in, <4 x i16>* %newaddr, align 8
+ store <4 x i16>* %newaddr, <4 x i16>** bitcast(i8** @ptr to <4 x i16>**)
+ ret void
+}
+
+define void @test_v4i16_post_store(<4 x i16> %in, <4 x i16>* %addr) {
+; CHECK-LABEL: test_v4i16_post_store:
+; CHECK: str d0, [x0], #40
+ %newaddr = getelementptr <4 x i16>* %addr, i32 5
+ store <4 x i16> %in, <4 x i16>* %addr, align 8
+ store <4 x i16>* %newaddr, <4 x i16>** bitcast(i8** @ptr to <4 x i16>**)
+ ret void
+}
+
+define <2 x i32> @test_v2i32_pre_load(<2 x i32>* %addr) {
+; CHECK-LABEL: test_v2i32_pre_load:
+; CHECK: ldr d0, [x0, #40]!
+ %newaddr = getelementptr <2 x i32>* %addr, i32 5
+ %val = load <2 x i32>* %newaddr, align 8
+ store <2 x i32>* %newaddr, <2 x i32>** bitcast(i8** @ptr to <2 x i32>**)
+ ret <2 x i32> %val
+}
+
+define <2 x i32> @test_v2i32_post_load(<2 x i32>* %addr) {
+; CHECK-LABEL: test_v2i32_post_load:
+; CHECK: ldr d0, [x0], #40
+ %newaddr = getelementptr <2 x i32>* %addr, i32 5
+ %val = load <2 x i32>* %addr, align 8
+ store <2 x i32>* %newaddr, <2 x i32>** bitcast(i8** @ptr to <2 x i32>**)
+ ret <2 x i32> %val
+}
+
+define void @test_v2i32_pre_store(<2 x i32> %in, <2 x i32>* %addr) {
+; CHECK-LABEL: test_v2i32_pre_store:
+; CHECK: str d0, [x0, #40]!
+ %newaddr = getelementptr <2 x i32>* %addr, i32 5
+ store <2 x i32> %in, <2 x i32>* %newaddr, align 8
+ store <2 x i32>* %newaddr, <2 x i32>** bitcast(i8** @ptr to <2 x i32>**)
+ ret void
+}
+
+define void @test_v2i32_post_store(<2 x i32> %in, <2 x i32>* %addr) {
+; CHECK-LABEL: test_v2i32_post_store:
+; CHECK: str d0, [x0], #40
+ %newaddr = getelementptr <2 x i32>* %addr, i32 5
+ store <2 x i32> %in, <2 x i32>* %addr, align 8
+ store <2 x i32>* %newaddr, <2 x i32>** bitcast(i8** @ptr to <2 x i32>**)
+ ret void
+}
+
+define <2 x float> @test_v2f32_pre_load(<2 x float>* %addr) {
+; CHECK-LABEL: test_v2f32_pre_load:
+; CHECK: ldr d0, [x0, #40]!
+ %newaddr = getelementptr <2 x float>* %addr, i32 5
+ %val = load <2 x float>* %newaddr, align 8
+ store <2 x float>* %newaddr, <2 x float>** bitcast(i8** @ptr to <2 x float>**)
+ ret <2 x float> %val
+}
+
+define <2 x float> @test_v2f32_post_load(<2 x float>* %addr) {
+; CHECK-LABEL: test_v2f32_post_load:
+; CHECK: ldr d0, [x0], #40
+ %newaddr = getelementptr <2 x float>* %addr, i32 5
+ %val = load <2 x float>* %addr, align 8
+ store <2 x float>* %newaddr, <2 x float>** bitcast(i8** @ptr to <2 x float>**)
+ ret <2 x float> %val
+}
+
+define void @test_v2f32_pre_store(<2 x float> %in, <2 x float>* %addr) {
+; CHECK-LABEL: test_v2f32_pre_store:
+; CHECK: str d0, [x0, #40]!
+ %newaddr = getelementptr <2 x float>* %addr, i32 5
+ store <2 x float> %in, <2 x float>* %newaddr, align 8
+ store <2 x float>* %newaddr, <2 x float>** bitcast(i8** @ptr to <2 x float>**)
+ ret void
+}
+
+define void @test_v2f32_post_store(<2 x float> %in, <2 x float>* %addr) {
+; CHECK-LABEL: test_v2f32_post_store:
+; CHECK: str d0, [x0], #40
+ %newaddr = getelementptr <2 x float>* %addr, i32 5
+ store <2 x float> %in, <2 x float>* %addr, align 8
+ store <2 x float>* %newaddr, <2 x float>** bitcast(i8** @ptr to <2 x float>**)
+ ret void
+}
+
+define <1 x i64> @test_v1i64_pre_load(<1 x i64>* %addr) {
+; CHECK-LABEL: test_v1i64_pre_load:
+; CHECK: ldr d0, [x0, #40]!
+ %newaddr = getelementptr <1 x i64>* %addr, i32 5
+ %val = load <1 x i64>* %newaddr, align 8
+ store <1 x i64>* %newaddr, <1 x i64>** bitcast(i8** @ptr to <1 x i64>**)
+ ret <1 x i64> %val
+}
+
+define <1 x i64> @test_v1i64_post_load(<1 x i64>* %addr) {
+; CHECK-LABEL: test_v1i64_post_load:
+; CHECK: ldr d0, [x0], #40
+ %newaddr = getelementptr <1 x i64>* %addr, i32 5
+ %val = load <1 x i64>* %addr, align 8
+ store <1 x i64>* %newaddr, <1 x i64>** bitcast(i8** @ptr to <1 x i64>**)
+ ret <1 x i64> %val
+}
+
+define void @test_v1i64_pre_store(<1 x i64> %in, <1 x i64>* %addr) {
+; CHECK-LABEL: test_v1i64_pre_store:
+; CHECK: str d0, [x0, #40]!
+ %newaddr = getelementptr <1 x i64>* %addr, i32 5
+ store <1 x i64> %in, <1 x i64>* %newaddr, align 8
+ store <1 x i64>* %newaddr, <1 x i64>** bitcast(i8** @ptr to <1 x i64>**)
+ ret void
+}
+
+define void @test_v1i64_post_store(<1 x i64> %in, <1 x i64>* %addr) {
+; CHECK-LABEL: test_v1i64_post_store:
+; CHECK: str d0, [x0], #40
+ %newaddr = getelementptr <1 x i64>* %addr, i32 5
+ store <1 x i64> %in, <1 x i64>* %addr, align 8
+ store <1 x i64>* %newaddr, <1 x i64>** bitcast(i8** @ptr to <1 x i64>**)
+ ret void
+}
+
+define <16 x i8> @test_v16i8_pre_load(<16 x i8>* %addr) {
+; CHECK-LABEL: test_v16i8_pre_load:
+; CHECK: ldr q0, [x0, #80]!
+ %newaddr = getelementptr <16 x i8>* %addr, i32 5
+ %val = load <16 x i8>* %newaddr, align 8
+ store <16 x i8>* %newaddr, <16 x i8>** bitcast(i8** @ptr to <16 x i8>**)
+ ret <16 x i8> %val
+}
+
+define <16 x i8> @test_v16i8_post_load(<16 x i8>* %addr) {
+; CHECK-LABEL: test_v16i8_post_load:
+; CHECK: ldr q0, [x0], #80
+ %newaddr = getelementptr <16 x i8>* %addr, i32 5
+ %val = load <16 x i8>* %addr, align 8
+ store <16 x i8>* %newaddr, <16 x i8>** bitcast(i8** @ptr to <16 x i8>**)
+ ret <16 x i8> %val
+}
+
+define void @test_v16i8_pre_store(<16 x i8> %in, <16 x i8>* %addr) {
+; CHECK-LABEL: test_v16i8_pre_store:
+; CHECK: str q0, [x0, #80]!
+ %newaddr = getelementptr <16 x i8>* %addr, i32 5
+ store <16 x i8> %in, <16 x i8>* %newaddr, align 8
+ store <16 x i8>* %newaddr, <16 x i8>** bitcast(i8** @ptr to <16 x i8>**)
+ ret void
+}
+
+define void @test_v16i8_post_store(<16 x i8> %in, <16 x i8>* %addr) {
+; CHECK-LABEL: test_v16i8_post_store:
+; CHECK: str q0, [x0], #80
+ %newaddr = getelementptr <16 x i8>* %addr, i32 5
+ store <16 x i8> %in, <16 x i8>* %addr, align 8
+ store <16 x i8>* %newaddr, <16 x i8>** bitcast(i8** @ptr to <16 x i8>**)
+ ret void
+}
+
+define <8 x i16> @test_v8i16_pre_load(<8 x i16>* %addr) {
+; CHECK-LABEL: test_v8i16_pre_load:
+; CHECK: ldr q0, [x0, #80]!
+ %newaddr = getelementptr <8 x i16>* %addr, i32 5
+ %val = load <8 x i16>* %newaddr, align 8
+ store <8 x i16>* %newaddr, <8 x i16>** bitcast(i8** @ptr to <8 x i16>**)
+ ret <8 x i16> %val
+}
+
+define <8 x i16> @test_v8i16_post_load(<8 x i16>* %addr) {
+; CHECK-LABEL: test_v8i16_post_load:
+; CHECK: ldr q0, [x0], #80
+ %newaddr = getelementptr <8 x i16>* %addr, i32 5
+ %val = load <8 x i16>* %addr, align 8
+ store <8 x i16>* %newaddr, <8 x i16>** bitcast(i8** @ptr to <8 x i16>**)
+ ret <8 x i16> %val
+}
+
+define void @test_v8i16_pre_store(<8 x i16> %in, <8 x i16>* %addr) {
+; CHECK-LABEL: test_v8i16_pre_store:
+; CHECK: str q0, [x0, #80]!
+ %newaddr = getelementptr <8 x i16>* %addr, i32 5
+ store <8 x i16> %in, <8 x i16>* %newaddr, align 8
+ store <8 x i16>* %newaddr, <8 x i16>** bitcast(i8** @ptr to <8 x i16>**)
+ ret void
+}
+
+define void @test_v8i16_post_store(<8 x i16> %in, <8 x i16>* %addr) {
+; CHECK-LABEL: test_v8i16_post_store:
+; CHECK: str q0, [x0], #80
+ %newaddr = getelementptr <8 x i16>* %addr, i32 5
+ store <8 x i16> %in, <8 x i16>* %addr, align 8
+ store <8 x i16>* %newaddr, <8 x i16>** bitcast(i8** @ptr to <8 x i16>**)
+ ret void
+}
+
+define <4 x i32> @test_v4i32_pre_load(<4 x i32>* %addr) {
+; CHECK-LABEL: test_v4i32_pre_load:
+; CHECK: ldr q0, [x0, #80]!
+ %newaddr = getelementptr <4 x i32>* %addr, i32 5
+ %val = load <4 x i32>* %newaddr, align 8
+ store <4 x i32>* %newaddr, <4 x i32>** bitcast(i8** @ptr to <4 x i32>**)
+ ret <4 x i32> %val
+}
+
+define <4 x i32> @test_v4i32_post_load(<4 x i32>* %addr) {
+; CHECK-LABEL: test_v4i32_post_load:
+; CHECK: ldr q0, [x0], #80
+ %newaddr = getelementptr <4 x i32>* %addr, i32 5
+ %val = load <4 x i32>* %addr, align 8
+ store <4 x i32>* %newaddr, <4 x i32>** bitcast(i8** @ptr to <4 x i32>**)
+ ret <4 x i32> %val
+}
+
+define void @test_v4i32_pre_store(<4 x i32> %in, <4 x i32>* %addr) {
+; CHECK-LABEL: test_v4i32_pre_store:
+; CHECK: str q0, [x0, #80]!
+ %newaddr = getelementptr <4 x i32>* %addr, i32 5
+ store <4 x i32> %in, <4 x i32>* %newaddr, align 8
+ store <4 x i32>* %newaddr, <4 x i32>** bitcast(i8** @ptr to <4 x i32>**)
+ ret void
+}
+
+define void @test_v4i32_post_store(<4 x i32> %in, <4 x i32>* %addr) {
+; CHECK-LABEL: test_v4i32_post_store:
+; CHECK: str q0, [x0], #80
+ %newaddr = getelementptr <4 x i32>* %addr, i32 5
+ store <4 x i32> %in, <4 x i32>* %addr, align 8
+ store <4 x i32>* %newaddr, <4 x i32>** bitcast(i8** @ptr to <4 x i32>**)
+ ret void
+}
+
+
+define <4 x float> @test_v4f32_pre_load(<4 x float>* %addr) {
+; CHECK-LABEL: test_v4f32_pre_load:
+; CHECK: ldr q0, [x0, #80]!
+ %newaddr = getelementptr <4 x float>* %addr, i32 5
+ %val = load <4 x float>* %newaddr, align 8
+ store <4 x float>* %newaddr, <4 x float>** bitcast(i8** @ptr to <4 x float>**)
+ ret <4 x float> %val
+}
+
+define <4 x float> @test_v4f32_post_load(<4 x float>* %addr) {
+; CHECK-LABEL: test_v4f32_post_load:
+; CHECK: ldr q0, [x0], #80
+ %newaddr = getelementptr <4 x float>* %addr, i32 5
+ %val = load <4 x float>* %addr, align 8
+ store <4 x float>* %newaddr, <4 x float>** bitcast(i8** @ptr to <4 x float>**)
+ ret <4 x float> %val
+}
+
+define void @test_v4f32_pre_store(<4 x float> %in, <4 x float>* %addr) {
+; CHECK-LABEL: test_v4f32_pre_store:
+; CHECK: str q0, [x0, #80]!
+ %newaddr = getelementptr <4 x float>* %addr, i32 5
+ store <4 x float> %in, <4 x float>* %newaddr, align 8
+ store <4 x float>* %newaddr, <4 x float>** bitcast(i8** @ptr to <4 x float>**)
+ ret void
+}
+
+define void @test_v4f32_post_store(<4 x float> %in, <4 x float>* %addr) {
+; CHECK-LABEL: test_v4f32_post_store:
+; CHECK: str q0, [x0], #80
+ %newaddr = getelementptr <4 x float>* %addr, i32 5
+ store <4 x float> %in, <4 x float>* %addr, align 8
+ store <4 x float>* %newaddr, <4 x float>** bitcast(i8** @ptr to <4 x float>**)
+ ret void
+}
+
+
+define <2 x i64> @test_v2i64_pre_load(<2 x i64>* %addr) {
+; CHECK-LABEL: test_v2i64_pre_load:
+; CHECK: ldr q0, [x0, #80]!
+ %newaddr = getelementptr <2 x i64>* %addr, i32 5
+ %val = load <2 x i64>* %newaddr, align 8
+ store <2 x i64>* %newaddr, <2 x i64>** bitcast(i8** @ptr to <2 x i64>**)
+ ret <2 x i64> %val
+}
+
+define <2 x i64> @test_v2i64_post_load(<2 x i64>* %addr) {
+; CHECK-LABEL: test_v2i64_post_load:
+; CHECK: ldr q0, [x0], #80
+ %newaddr = getelementptr <2 x i64>* %addr, i32 5
+ %val = load <2 x i64>* %addr, align 8
+ store <2 x i64>* %newaddr, <2 x i64>** bitcast(i8** @ptr to <2 x i64>**)
+ ret <2 x i64> %val
+}
+
+define void @test_v2i64_pre_store(<2 x i64> %in, <2 x i64>* %addr) {
+; CHECK-LABEL: test_v2i64_pre_store:
+; CHECK: str q0, [x0, #80]!
+ %newaddr = getelementptr <2 x i64>* %addr, i32 5
+ store <2 x i64> %in, <2 x i64>* %newaddr, align 8
+ store <2 x i64>* %newaddr, <2 x i64>** bitcast(i8** @ptr to <2 x i64>**)
+ ret void
+}
+
+define void @test_v2i64_post_store(<2 x i64> %in, <2 x i64>* %addr) {
+; CHECK-LABEL: test_v2i64_post_store:
+; CHECK: str q0, [x0], #80
+ %newaddr = getelementptr <2 x i64>* %addr, i32 5
+ store <2 x i64> %in, <2 x i64>* %addr, align 8
+ store <2 x i64>* %newaddr, <2 x i64>** bitcast(i8** @ptr to <2 x i64>**)
+ ret void
+}
+
+
+define <2 x double> @test_v2f64_pre_load(<2 x double>* %addr) {
+; CHECK-LABEL: test_v2f64_pre_load:
+; CHECK: ldr q0, [x0, #80]!
+ %newaddr = getelementptr <2 x double>* %addr, i32 5
+ %val = load <2 x double>* %newaddr, align 8
+ store <2 x double>* %newaddr, <2 x double>** bitcast(i8** @ptr to <2 x double>**)
+ ret <2 x double> %val
+}
+
+define <2 x double> @test_v2f64_post_load(<2 x double>* %addr) {
+; CHECK-LABEL: test_v2f64_post_load:
+; CHECK: ldr q0, [x0], #80
+ %newaddr = getelementptr <2 x double>* %addr, i32 5
+ %val = load <2 x double>* %addr, align 8
+ store <2 x double>* %newaddr, <2 x double>** bitcast(i8** @ptr to <2 x double>**)
+ ret <2 x double> %val
+}
+
+define void @test_v2f64_pre_store(<2 x double> %in, <2 x double>* %addr) {
+; CHECK-LABEL: test_v2f64_pre_store:
+; CHECK: str q0, [x0, #80]!
+ %newaddr = getelementptr <2 x double>* %addr, i32 5
+ store <2 x double> %in, <2 x double>* %newaddr, align 8
+ store <2 x double>* %newaddr, <2 x double>** bitcast(i8** @ptr to <2 x double>**)
+ ret void
+}
+
+define void @test_v2f64_post_store(<2 x double> %in, <2 x double>* %addr) {
+; CHECK-LABEL: test_v2f64_post_store:
+; CHECK: str q0, [x0], #80
+ %newaddr = getelementptr <2 x double>* %addr, i32 5
+ store <2 x double> %in, <2 x double>* %addr, align 8
+ store <2 x double>* %newaddr, <2 x double>** bitcast(i8** @ptr to <2 x double>**)
+ ret void
+}
+
+define i8* @test_v16i8_post_imm_st1_lane(<16 x i8> %in, i8* %addr) {
+; CHECK-LABEL: test_v16i8_post_imm_st1_lane:
+; CHECK: st1.b { v0 }[3], [x0], #1
+ %elt = extractelement <16 x i8> %in, i32 3
+ store i8 %elt, i8* %addr
+
+ %newaddr = getelementptr i8* %addr, i32 1
+ ret i8* %newaddr
+}
+
+define i8* @test_v16i8_post_reg_st1_lane(<16 x i8> %in, i8* %addr) {
+; CHECK-LABEL: test_v16i8_post_reg_st1_lane:
+; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x2
+; CHECK: st1.b { v0 }[3], [x0], x[[OFFSET]]
+ %elt = extractelement <16 x i8> %in, i32 3
+ store i8 %elt, i8* %addr
+
+ %newaddr = getelementptr i8* %addr, i32 2
+ ret i8* %newaddr
+}
+
+
+define i16* @test_v8i16_post_imm_st1_lane(<8 x i16> %in, i16* %addr) {
+; CHECK-LABEL: test_v8i16_post_imm_st1_lane:
+; CHECK: st1.h { v0 }[3], [x0], #2
+ %elt = extractelement <8 x i16> %in, i32 3
+ store i16 %elt, i16* %addr
+
+ %newaddr = getelementptr i16* %addr, i32 1
+ ret i16* %newaddr
+}
+
+define i16* @test_v8i16_post_reg_st1_lane(<8 x i16> %in, i16* %addr) {
+; CHECK-LABEL: test_v8i16_post_reg_st1_lane:
+; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x4
+; CHECK: st1.h { v0 }[3], [x0], x[[OFFSET]]
+ %elt = extractelement <8 x i16> %in, i32 3
+ store i16 %elt, i16* %addr
+
+ %newaddr = getelementptr i16* %addr, i32 2
+ ret i16* %newaddr
+}
+
+define i32* @test_v4i32_post_imm_st1_lane(<4 x i32> %in, i32* %addr) {
+; CHECK-LABEL: test_v4i32_post_imm_st1_lane:
+; CHECK: st1.s { v0 }[3], [x0], #4
+ %elt = extractelement <4 x i32> %in, i32 3
+ store i32 %elt, i32* %addr
+
+ %newaddr = getelementptr i32* %addr, i32 1
+ ret i32* %newaddr
+}
+
+define i32* @test_v4i32_post_reg_st1_lane(<4 x i32> %in, i32* %addr) {
+; CHECK-LABEL: test_v4i32_post_reg_st1_lane:
+; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x8
+; CHECK: st1.s { v0 }[3], [x0], x[[OFFSET]]
+ %elt = extractelement <4 x i32> %in, i32 3
+ store i32 %elt, i32* %addr
+
+ %newaddr = getelementptr i32* %addr, i32 2
+ ret i32* %newaddr
+}
+
+define float* @test_v4f32_post_imm_st1_lane(<4 x float> %in, float* %addr) {
+; CHECK-LABEL: test_v4f32_post_imm_st1_lane:
+; CHECK: st1.s { v0 }[3], [x0], #4
+ %elt = extractelement <4 x float> %in, i32 3
+ store float %elt, float* %addr
+
+ %newaddr = getelementptr float* %addr, i32 1
+ ret float* %newaddr
+}
+
+define float* @test_v4f32_post_reg_st1_lane(<4 x float> %in, float* %addr) {
+; CHECK-LABEL: test_v4f32_post_reg_st1_lane:
+; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x8
+; CHECK: st1.s { v0 }[3], [x0], x[[OFFSET]]
+ %elt = extractelement <4 x float> %in, i32 3
+ store float %elt, float* %addr
+
+ %newaddr = getelementptr float* %addr, i32 2
+ ret float* %newaddr
+}
+
+define i64* @test_v2i64_post_imm_st1_lane(<2 x i64> %in, i64* %addr) {
+; CHECK-LABEL: test_v2i64_post_imm_st1_lane:
+; CHECK: st1.d { v0 }[1], [x0], #8
+ %elt = extractelement <2 x i64> %in, i64 1
+ store i64 %elt, i64* %addr
+
+ %newaddr = getelementptr i64* %addr, i64 1
+ ret i64* %newaddr
+}
+
+define i64* @test_v2i64_post_reg_st1_lane(<2 x i64> %in, i64* %addr) {
+; CHECK-LABEL: test_v2i64_post_reg_st1_lane:
+; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x10
+; CHECK: st1.d { v0 }[1], [x0], x[[OFFSET]]
+ %elt = extractelement <2 x i64> %in, i64 1
+ store i64 %elt, i64* %addr
+
+ %newaddr = getelementptr i64* %addr, i64 2
+ ret i64* %newaddr
+}
+
+define double* @test_v2f64_post_imm_st1_lane(<2 x double> %in, double* %addr) {
+; CHECK-LABEL: test_v2f64_post_imm_st1_lane:
+; CHECK: st1.d { v0 }[1], [x0], #8
+ %elt = extractelement <2 x double> %in, i32 1
+ store double %elt, double* %addr
+
+ %newaddr = getelementptr double* %addr, i32 1
+ ret double* %newaddr
+}
+
+define double* @test_v2f64_post_reg_st1_lane(<2 x double> %in, double* %addr) {
+; CHECK-LABEL: test_v2f64_post_reg_st1_lane:
+; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x10
+; CHECK: st1.d { v0 }[1], [x0], x[[OFFSET]]
+ %elt = extractelement <2 x double> %in, i32 1
+ store double %elt, double* %addr
+
+ %newaddr = getelementptr double* %addr, i32 2
+ ret double* %newaddr
+}
+
+define i8* @test_v8i8_post_imm_st1_lane(<8 x i8> %in, i8* %addr) {
+; CHECK-LABEL: test_v8i8_post_imm_st1_lane:
+; CHECK: st1.b { v0 }[3], [x0], #1
+ %elt = extractelement <8 x i8> %in, i32 3
+ store i8 %elt, i8* %addr
+
+ %newaddr = getelementptr i8* %addr, i32 1
+ ret i8* %newaddr
+}
+
+define i8* @test_v8i8_post_reg_st1_lane(<8 x i8> %in, i8* %addr) {
+; CHECK-LABEL: test_v8i8_post_reg_st1_lane:
+; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x2
+; CHECK: st1.b { v0 }[3], [x0], x[[OFFSET]]
+ %elt = extractelement <8 x i8> %in, i32 3
+ store i8 %elt, i8* %addr
+
+ %newaddr = getelementptr i8* %addr, i32 2
+ ret i8* %newaddr
+}
+
+define i16* @test_v4i16_post_imm_st1_lane(<4 x i16> %in, i16* %addr) {
+; CHECK-LABEL: test_v4i16_post_imm_st1_lane:
+; CHECK: st1.h { v0 }[3], [x0], #2
+ %elt = extractelement <4 x i16> %in, i32 3
+ store i16 %elt, i16* %addr
+
+ %newaddr = getelementptr i16* %addr, i32 1
+ ret i16* %newaddr
+}
+
+define i16* @test_v4i16_post_reg_st1_lane(<4 x i16> %in, i16* %addr) {
+; CHECK-LABEL: test_v4i16_post_reg_st1_lane:
+; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x4
+; CHECK: st1.h { v0 }[3], [x0], x[[OFFSET]]
+ %elt = extractelement <4 x i16> %in, i32 3
+ store i16 %elt, i16* %addr
+
+ %newaddr = getelementptr i16* %addr, i32 2
+ ret i16* %newaddr
+}
+
+define i32* @test_v2i32_post_imm_st1_lane(<2 x i32> %in, i32* %addr) {
+; CHECK-LABEL: test_v2i32_post_imm_st1_lane:
+; CHECK: st1.s { v0 }[1], [x0], #4
+ %elt = extractelement <2 x i32> %in, i32 1
+ store i32 %elt, i32* %addr
+
+ %newaddr = getelementptr i32* %addr, i32 1
+ ret i32* %newaddr
+}
+
+define i32* @test_v2i32_post_reg_st1_lane(<2 x i32> %in, i32* %addr) {
+; CHECK-LABEL: test_v2i32_post_reg_st1_lane:
+; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x8
+; CHECK: st1.s { v0 }[1], [x0], x[[OFFSET]]
+ %elt = extractelement <2 x i32> %in, i32 1
+ store i32 %elt, i32* %addr
+
+ %newaddr = getelementptr i32* %addr, i32 2
+ ret i32* %newaddr
+}
+
+define float* @test_v2f32_post_imm_st1_lane(<2 x float> %in, float* %addr) {
+; CHECK-LABEL: test_v2f32_post_imm_st1_lane:
+; CHECK: st1.s { v0 }[1], [x0], #4
+ %elt = extractelement <2 x float> %in, i32 1
+ store float %elt, float* %addr
+
+ %newaddr = getelementptr float* %addr, i32 1
+ ret float* %newaddr
+}
+
+define float* @test_v2f32_post_reg_st1_lane(<2 x float> %in, float* %addr) {
+; CHECK-LABEL: test_v2f32_post_reg_st1_lane:
+; CHECK: orr w[[OFFSET:[0-9]+]], wzr, #0x8
+; CHECK: st1.s { v0 }[1], [x0], x[[OFFSET]]
+ %elt = extractelement <2 x float> %in, i32 1
+ store float %elt, float* %addr
+
+ %newaddr = getelementptr float* %addr, i32 2
+ ret float* %newaddr
+}
+
+define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v16i8_post_imm_ld2:
+;CHECK: ld2.16b { v0, v1 }, [x0], #32
+ %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 32
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8> } %ld2
+}
+
+define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld2(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v16i8_post_reg_ld2:
+;CHECK: ld2.16b { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8> } %ld2
+}
+
+declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8*)
+
+
+define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld2(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v8i8_post_imm_ld2:
+;CHECK: ld2.8b { v0, v1 }, [x0], #16
+ %ld2 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 16
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8> } %ld2
+}
+
+define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld2(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i8_post_reg_ld2:
+;CHECK: ld2.8b { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8> } %ld2
+}
+
+declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0i8(i8*)
+
+
+define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v8i16_post_imm_ld2:
+;CHECK: ld2.8h { v0, v1 }, [x0], #32
+ %ld2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 16
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16> } %ld2
+}
+
+define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld2(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i16_post_reg_ld2:
+;CHECK: ld2.8h { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16> } %ld2
+}
+
+declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0i16(i16*)
+
+
+define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v4i16_post_imm_ld2:
+;CHECK: ld2.4h { v0, v1 }, [x0], #16
+ %ld2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 8
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16> } %ld2
+}
+
+define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld2(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i16_post_reg_ld2:
+;CHECK: ld2.4h { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16> } %ld2
+}
+
+declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0i16(i16*)
+
+
+define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v4i32_post_imm_ld2:
+;CHECK: ld2.4s { v0, v1 }, [x0], #32
+ %ld2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 8
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32> } %ld2
+}
+
+define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld2(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i32_post_reg_ld2:
+;CHECK: ld2.4s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32> } %ld2
+}
+
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i32(i32*)
+
+
+define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v2i32_post_imm_ld2:
+;CHECK: ld2.2s { v0, v1 }, [x0], #16
+ %ld2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 4
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32> } %ld2
+}
+
+define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld2(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i32_post_reg_ld2:
+;CHECK: ld2.2s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32> } %ld2
+}
+
+declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32*)
+
+
+define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v2i64_post_imm_ld2:
+;CHECK: ld2.2d { v0, v1 }, [x0], #32
+ %ld2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 4
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64> } %ld2
+}
+
+define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld2(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i64_post_reg_ld2:
+;CHECK: ld2.2d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64> } %ld2
+}
+
+declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0i64(i64*)
+
+
+define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld2(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v1i64_post_imm_ld2:
+;CHECK: ld1.1d { v0, v1 }, [x0], #16
+ %ld2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 2
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64> } %ld2
+}
+
+define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld2(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1i64_post_reg_ld2:
+;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64> } %ld2
+}
+
+declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0i64(i64*)
+
+
+define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld2(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v4f32_post_imm_ld2:
+;CHECK: ld2.4s { v0, v1 }, [x0], #32
+ %ld2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 8
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float> } %ld2
+}
+
+define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld2(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4f32_post_reg_ld2:
+;CHECK: ld2.4s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float> } %ld2
+}
+
+declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0f32(float*)
+
+
+define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v2f32_post_imm_ld2:
+;CHECK: ld2.2s { v0, v1 }, [x0], #16
+ %ld2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 4
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float> } %ld2
+}
+
+define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld2(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f32_post_reg_ld2:
+;CHECK: ld2.2s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float> } %ld2
+}
+
+declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0f32(float*)
+
+
+define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld2(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v2f64_post_imm_ld2:
+;CHECK: ld2.2d { v0, v1 }, [x0], #32
+ %ld2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 4
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double> } %ld2
+}
+
+define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld2(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f64_post_reg_ld2:
+;CHECK: ld2.2d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double> } %ld2
+}
+
+declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0f64(double*)
+
+
+define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v1f64_post_imm_ld2:
+;CHECK: ld1.1d { v0, v1 }, [x0], #16
+ %ld2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 2
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double> } %ld2
+}
+
+define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld2(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1f64_post_reg_ld2:
+;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double> } %ld2
+}
+
+declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0f64(double*)
+
+
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld3(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v16i8_post_imm_ld3:
+;CHECK: ld3.16b { v0, v1, v2 }, [x0], #48
+ %ld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 48
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3
+}
+
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld3(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v16i8_post_reg_ld3:
+;CHECK: ld3.16b { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0i8(i8*)
+
+
+define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld3(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v8i8_post_imm_ld3:
+;CHECK: ld3.8b { v0, v1, v2 }, [x0], #24
+ %ld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 24
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3
+}
+
+define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld3(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i8_post_reg_ld3:
+;CHECK: ld3.8b { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3
+}
+
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0i8(i8*)
+
+
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld3(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v8i16_post_imm_ld3:
+;CHECK: ld3.8h { v0, v1, v2 }, [x0], #48
+ %ld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 24
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3
+}
+
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld3(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i16_post_reg_ld3:
+;CHECK: ld3.8h { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3
+}
+
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0i16(i16*)
+
+
+define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld3(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v4i16_post_imm_ld3:
+;CHECK: ld3.4h { v0, v1, v2 }, [x0], #24
+ %ld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 12
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3
+}
+
+define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld3(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i16_post_reg_ld3:
+;CHECK: ld3.4h { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3
+}
+
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0i16(i16*)
+
+
+define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld3(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v4i32_post_imm_ld3:
+;CHECK: ld3.4s { v0, v1, v2 }, [x0], #48
+ %ld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 12
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3
+}
+
+define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld3(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i32_post_reg_ld3:
+;CHECK: ld3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3
+}
+
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i32(i32*)
+
+
+define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld3(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v2i32_post_imm_ld3:
+;CHECK: ld3.2s { v0, v1, v2 }, [x0], #24
+ %ld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 6
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3
+}
+
+define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld3(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i32_post_reg_ld3:
+;CHECK: ld3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3
+}
+
+declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0i32(i32*)
+
+
+define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld3(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v2i64_post_imm_ld3:
+;CHECK: ld3.2d { v0, v1, v2 }, [x0], #48
+ %ld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 6
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3
+}
+
+define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld3(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i64_post_reg_ld3:
+;CHECK: ld3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3
+}
+
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0i64(i64*)
+
+
+define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld3(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v1i64_post_imm_ld3:
+;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24
+ %ld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 3
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3
+}
+
+define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld3(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1i64_post_reg_ld3:
+;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3
+}
+
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0i64(i64*)
+
+
+define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld3(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v4f32_post_imm_ld3:
+;CHECK: ld3.4s { v0, v1, v2 }, [x0], #48
+ %ld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 12
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float> } %ld3
+}
+
+define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld3(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4f32_post_reg_ld3:
+;CHECK: ld3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float> } %ld3
+}
+
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0f32(float*)
+
+
+define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld3(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v2f32_post_imm_ld3:
+;CHECK: ld3.2s { v0, v1, v2 }, [x0], #24
+ %ld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 6
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float> } %ld3
+}
+
+define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld3(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f32_post_reg_ld3:
+;CHECK: ld3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float> } %ld3
+}
+
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0f32(float*)
+
+
+define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld3(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v2f64_post_imm_ld3:
+;CHECK: ld3.2d { v0, v1, v2 }, [x0], #48
+ %ld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 6
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double> } %ld3
+}
+
+define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld3(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f64_post_reg_ld3:
+;CHECK: ld3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double> } %ld3
+}
+
+declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0f64(double*)
+
+
+define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld3(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v1f64_post_imm_ld3:
+;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24
+ %ld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 3
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double> } %ld3
+}
+
+define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld3(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1f64_post_reg_ld3:
+;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double> } %ld3
+}
+
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0f64(double*)
+
+
+define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld4(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v16i8_post_imm_ld4:
+;CHECK: ld4.16b { v0, v1, v2, v3 }, [x0], #64
+ %ld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 64
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4
+}
+
+define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld4(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v16i8_post_reg_ld4:
+;CHECK: ld4.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0i8(i8*)
+
+
+define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld4(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v8i8_post_imm_ld4:
+;CHECK: ld4.8b { v0, v1, v2, v3 }, [x0], #32
+ %ld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 32
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4
+}
+
+define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld4(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i8_post_reg_ld4:
+;CHECK: ld4.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4
+}
+
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0i8(i8*)
+
+
+define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld4(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v8i16_post_imm_ld4:
+;CHECK: ld4.8h { v0, v1, v2, v3 }, [x0], #64
+ %ld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 32
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4
+}
+
+define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld4(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i16_post_reg_ld4:
+;CHECK: ld4.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4
+}
+
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0i16(i16*)
+
+
+define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld4(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v4i16_post_imm_ld4:
+;CHECK: ld4.4h { v0, v1, v2, v3 }, [x0], #32
+ %ld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 16
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4
+}
+
+define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld4(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i16_post_reg_ld4:
+;CHECK: ld4.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4
+}
+
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0i16(i16*)
+
+
+define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld4(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v4i32_post_imm_ld4:
+;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], #64
+ %ld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 16
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4
+}
+
+define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld4(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i32_post_reg_ld4:
+;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4
+}
+
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0i32(i32*)
+
+
+define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld4(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v2i32_post_imm_ld4:
+;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], #32
+ %ld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 8
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4
+}
+
+define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld4(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i32_post_reg_ld4:
+;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4
+}
+
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0i32(i32*)
+
+
+define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld4(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v2i64_post_imm_ld4:
+;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], #64
+ %ld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 8
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4
+}
+
+define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld4(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i64_post_reg_ld4:
+;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4
+}
+
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i64(i64*)
+
+
+define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld4(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v1i64_post_imm_ld4:
+;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32
+ %ld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 4
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4
+}
+
+define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld4(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1i64_post_reg_ld4:
+;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4
+}
+
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0i64(i64*)
+
+
+define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld4(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v4f32_post_imm_ld4:
+;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], #64
+ %ld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 16
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4
+}
+
+define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld4(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4f32_post_reg_ld4:
+;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4
+}
+
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0f32(float*)
+
+
+define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld4(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v2f32_post_imm_ld4:
+;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], #32
+ %ld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 8
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4
+}
+
+define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld4(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f32_post_reg_ld4:
+;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4
+}
+
+declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0f32(float*)
+
+
+define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld4(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v2f64_post_imm_ld4:
+;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], #64
+ %ld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 8
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4
+}
+
+define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld4(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f64_post_reg_ld4:
+;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4
+}
+
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0f64(double*)
+
+
+define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld4(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v1f64_post_imm_ld4:
+;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32
+ %ld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 4
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4
+}
+
+define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld4(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1f64_post_reg_ld4:
+;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4
+}
+
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0f64(double*)
+
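+; ld1x2 post-increment tests: two-register ld1 loads with immediate and register write-back.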
+define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld1x2(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v16i8_post_imm_ld1x2:
+;CHECK: ld1.16b { v0, v1 }, [x0], #32
+ %ld1x2 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 32
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8> } %ld1x2
+}
+
+define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld1x2(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v16i8_post_reg_ld1x2:
+;CHECK: ld1.16b { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8> } %ld1x2
+}
+
+declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8*)
+
+
+define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld1x2(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v8i8_post_imm_ld1x2:
+;CHECK: ld1.8b { v0, v1 }, [x0], #16
+ %ld1x2 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 16
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8> } %ld1x2
+}
+
+define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld1x2(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i8_post_reg_ld1x2:
+;CHECK: ld1.8b { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8> } %ld1x2
+}
+
+declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8*)
+
+
+define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld1x2(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v8i16_post_imm_ld1x2:
+;CHECK: ld1.8h { v0, v1 }, [x0], #32
+ %ld1x2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 16
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16> } %ld1x2
+}
+
+define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld1x2(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i16_post_reg_ld1x2:
+;CHECK: ld1.8h { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16> } %ld1x2
+}
+
+declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16*)
+
+
+define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld1x2(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v4i16_post_imm_ld1x2:
+;CHECK: ld1.4h { v0, v1 }, [x0], #16
+ %ld1x2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 8
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16> } %ld1x2
+}
+
+define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld1x2(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i16_post_reg_ld1x2:
+;CHECK: ld1.4h { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16> } %ld1x2
+}
+
+declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16*)
+
+
+define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld1x2(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v4i32_post_imm_ld1x2:
+;CHECK: ld1.4s { v0, v1 }, [x0], #32
+ %ld1x2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 8
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32> } %ld1x2
+}
+
+define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld1x2(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i32_post_reg_ld1x2:
+;CHECK: ld1.4s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32> } %ld1x2
+}
+
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32*)
+
+
+define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld1x2(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v2i32_post_imm_ld1x2:
+;CHECK: ld1.2s { v0, v1 }, [x0], #16
+ %ld1x2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 4
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32> } %ld1x2
+}
+
+define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld1x2(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i32_post_reg_ld1x2:
+;CHECK: ld1.2s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32> } %ld1x2
+}
+
+declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32*)
+
+
+define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld1x2(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v2i64_post_imm_ld1x2:
+;CHECK: ld1.2d { v0, v1 }, [x0], #32
+ %ld1x2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 4
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64> } %ld1x2
+}
+
+define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld1x2(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i64_post_reg_ld1x2:
+;CHECK: ld1.2d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64> } %ld1x2
+}
+
+declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64*)
+
+
+define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld1x2(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v1i64_post_imm_ld1x2:
+;CHECK: ld1.1d { v0, v1 }, [x0], #16
+ %ld1x2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 2
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64> } %ld1x2
+}
+
+define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld1x2(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1i64_post_reg_ld1x2:
+;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64> } %ld1x2
+}
+
+declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64*)
+
+
+define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld1x2(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v4f32_post_imm_ld1x2:
+;CHECK: ld1.4s { v0, v1 }, [x0], #32
+ %ld1x2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 8
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float> } %ld1x2
+}
+
+define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld1x2(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4f32_post_reg_ld1x2:
+;CHECK: ld1.4s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float> } %ld1x2
+}
+
+declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float*)
+
+
+define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld1x2(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v2f32_post_imm_ld1x2:
+;CHECK: ld1.2s { v0, v1 }, [x0], #16
+ %ld1x2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 4
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float> } %ld1x2
+}
+
+define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld1x2(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f32_post_reg_ld1x2:
+;CHECK: ld1.2s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float> } %ld1x2
+}
+
+declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float*)
+
+
+define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld1x2(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v2f64_post_imm_ld1x2:
+;CHECK: ld1.2d { v0, v1 }, [x0], #32
+ %ld1x2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 4
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double> } %ld1x2
+}
+
+define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld1x2(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f64_post_reg_ld1x2:
+;CHECK: ld1.2d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double> } %ld1x2
+}
+
+declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double*)
+
+
+define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld1x2(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v1f64_post_imm_ld1x2:
+;CHECK: ld1.1d { v0, v1 }, [x0], #16
+ %ld1x2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 2
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double> } %ld1x2
+}
+
+define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld1x2(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1f64_post_reg_ld1x2:
+;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld1x2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double> } %ld1x2
+}
+
+declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double*)
+
+
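+; ld1x3 post-increment tests: three-register ld1 loads with immediate and register write-back.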
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld1x3(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v16i8_post_imm_ld1x3:
+;CHECK: ld1.16b { v0, v1, v2 }, [x0], #48
+ %ld1x3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 48
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld1x3
+}
+
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld1x3(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v16i8_post_reg_ld1x3:
+;CHECK: ld1.16b { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld1x3
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8*)
+
+
+define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld1x3(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v8i8_post_imm_ld1x3:
+;CHECK: ld1.8b { v0, v1, v2 }, [x0], #24
+ %ld1x3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 24
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld1x3
+}
+
+define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld1x3(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i8_post_reg_ld1x3:
+;CHECK: ld1.8b { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld1x3
+}
+
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8*)
+
+
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld1x3(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v8i16_post_imm_ld1x3:
+;CHECK: ld1.8h { v0, v1, v2 }, [x0], #48
+ %ld1x3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 24
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld1x3
+}
+
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld1x3(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i16_post_reg_ld1x3:
+;CHECK: ld1.8h { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld1x3
+}
+
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16*)
+
+
+define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld1x3(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v4i16_post_imm_ld1x3:
+;CHECK: ld1.4h { v0, v1, v2 }, [x0], #24
+ %ld1x3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 12
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld1x3
+}
+
+define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld1x3(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i16_post_reg_ld1x3:
+;CHECK: ld1.4h { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld1x3
+}
+
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16*)
+
+
+define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld1x3(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v4i32_post_imm_ld1x3:
+;CHECK: ld1.4s { v0, v1, v2 }, [x0], #48
+ %ld1x3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 12
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld1x3
+}
+
+define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld1x3(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i32_post_reg_ld1x3:
+;CHECK: ld1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld1x3
+}
+
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32*)
+
+
+define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld1x3(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v2i32_post_imm_ld1x3:
+;CHECK: ld1.2s { v0, v1, v2 }, [x0], #24
+ %ld1x3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 6
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld1x3
+}
+
+define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld1x3(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i32_post_reg_ld1x3:
+;CHECK: ld1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld1x3
+}
+
+declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32*)
+
+
+define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld1x3(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v2i64_post_imm_ld1x3:
+;CHECK: ld1.2d { v0, v1, v2 }, [x0], #48
+ %ld1x3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 6
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld1x3
+}
+
+define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld1x3(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i64_post_reg_ld1x3:
+;CHECK: ld1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld1x3
+}
+
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64*)
+
+
+define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld1x3(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v1i64_post_imm_ld1x3:
+;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24
+ %ld1x3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 3
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld1x3
+}
+
+define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld1x3(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1i64_post_reg_ld1x3:
+;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld1x3
+}
+
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64*)
+
+
+define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld1x3(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v4f32_post_imm_ld1x3:
+;CHECK: ld1.4s { v0, v1, v2 }, [x0], #48
+ %ld1x3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 12
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float> } %ld1x3
+}
+
+define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld1x3(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4f32_post_reg_ld1x3:
+;CHECK: ld1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float> } %ld1x3
+}
+
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float*)
+
+
+define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld1x3(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v2f32_post_imm_ld1x3:
+;CHECK: ld1.2s { v0, v1, v2 }, [x0], #24
+ %ld1x3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 6
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float> } %ld1x3
+}
+
+define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld1x3(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f32_post_reg_ld1x3:
+;CHECK: ld1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float> } %ld1x3
+}
+
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float*)
+
+
+define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld1x3(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v2f64_post_imm_ld1x3:
+;CHECK: ld1.2d { v0, v1, v2 }, [x0], #48
+ %ld1x3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 6
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double> } %ld1x3
+}
+
+define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld1x3(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f64_post_reg_ld1x3:
+;CHECK: ld1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double> } %ld1x3
+}
+
+declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double*)
+
+
+define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld1x3(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v1f64_post_imm_ld1x3:
+;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24
+ %ld1x3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 3
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double> } %ld1x3
+}
+
+define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld1x3(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1f64_post_reg_ld1x3:
+;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld1x3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double> } %ld1x3
+}
+
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double*)
+
+
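+; ld1x4 post-increment tests: four-register ld1 loads with immediate and register write-back.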
+define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld1x4(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v16i8_post_imm_ld1x4:
+;CHECK: ld1.16b { v0, v1, v2, v3 }, [x0], #64
+ %ld1x4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 64
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld1x4
+}
+
+define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld1x4(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v16i8_post_reg_ld1x4:
+;CHECK: ld1.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld1x4
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8*)
+
+
+define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld1x4(i8* %A, i8** %ptr) {
+;CHECK-LABEL: test_v8i8_post_imm_ld1x4:
+;CHECK: ld1.8b { v0, v1, v2, v3 }, [x0], #32
+ %ld1x4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 32
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld1x4
+}
+
+define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld1x4(i8* %A, i8** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i8_post_reg_ld1x4:
+;CHECK: ld1.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld1x4
+}
+
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8*)
+
+
+define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld1x4(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v8i16_post_imm_ld1x4:
+;CHECK: ld1.8h { v0, v1, v2, v3 }, [x0], #64
+ %ld1x4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 32
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld1x4
+}
+
+define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld1x4(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v8i16_post_reg_ld1x4:
+;CHECK: ld1.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld1x4
+}
+
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16*)
+
+
+define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld1x4(i16* %A, i16** %ptr) {
+;CHECK-LABEL: test_v4i16_post_imm_ld1x4:
+;CHECK: ld1.4h { v0, v1, v2, v3 }, [x0], #32
+ %ld1x4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 16
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld1x4
+}
+
+define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld1x4(i16* %A, i16** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i16_post_reg_ld1x4:
+;CHECK: ld1.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld1x4
+}
+
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16*)
+
+
+define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld1x4(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v4i32_post_imm_ld1x4:
+;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], #64
+ %ld1x4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 16
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld1x4
+}
+
+define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld1x4(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4i32_post_reg_ld1x4:
+;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld1x4
+}
+
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32*)
+
+
+define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld1x4(i32* %A, i32** %ptr) {
+;CHECK-LABEL: test_v2i32_post_imm_ld1x4:
+;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], #32
+ %ld1x4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 8
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld1x4
+}
+
+define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld1x4(i32* %A, i32** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i32_post_reg_ld1x4:
+;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld1x4
+}
+
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32*)
+
+
+define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld1x4(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v2i64_post_imm_ld1x4:
+;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], #64
+ %ld1x4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 8
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld1x4
+}
+
+define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld1x4(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2i64_post_reg_ld1x4:
+;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld1x4
+}
+
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64*)
+
+
+define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld1x4(i64* %A, i64** %ptr) {
+;CHECK-LABEL: test_v1i64_post_imm_ld1x4:
+;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32
+ %ld1x4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 4
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld1x4
+}
+
+define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld1x4(i64* %A, i64** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1i64_post_reg_ld1x4:
+;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld1x4
+}
+
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64*)
+
+
+define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld1x4(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v4f32_post_imm_ld1x4:
+;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], #64
+ %ld1x4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 16
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld1x4
+}
+
+define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld1x4(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v4f32_post_reg_ld1x4:
+;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld1x4
+}
+
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float*)
+
+
+define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld1x4(float* %A, float** %ptr) {
+;CHECK-LABEL: test_v2f32_post_imm_ld1x4:
+;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], #32
+ %ld1x4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 8
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld1x4
+}
+
+define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld1x4(float* %A, float** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f32_post_reg_ld1x4:
+;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld1x4
+}
+
+declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float*)
+
+
+define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld1x4(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v2f64_post_imm_ld1x4:
+;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], #64
+ %ld1x4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 8
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld1x4
+}
+
+define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld1x4(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v2f64_post_reg_ld1x4:
+;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld1x4
+}
+
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double*)
+
+
+define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld1x4(double* %A, double** %ptr) {
+;CHECK-LABEL: test_v1f64_post_imm_ld1x4:
+;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32
+ %ld1x4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 4
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld1x4
+}
+
+define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld1x4(double* %A, double** %ptr, i64 %inc) {
+;CHECK-LABEL: test_v1f64_post_reg_ld1x4:
+;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld1x4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld1x4
+}
+
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double*)
+
+
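+; ld2r post-increment tests: two-register load-and-replicate with immediate and register write-back.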
+define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2r(i8* %A, i8** %ptr) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_ld2r:
+;CHECK: ld2r.16b { v0, v1 }, [x0], #2
+ %ld2 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 2
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8> } %ld2
+}
+
+define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld2r(i8* %A, i8** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_ld2r:
+;CHECK: ld2r.16b { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8> } %ld2
+}
+
+declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8*) nounwind readonly
+
+
+define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld2r(i8* %A, i8** %ptr) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_ld2r:
+;CHECK: ld2r.8b { v0, v1 }, [x0], #2
+ %ld2 = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 2
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8> } %ld2
+}
+
+define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld2r(i8* %A, i8** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_ld2r:
+;CHECK: ld2r.8b { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8> } %ld2
+}
+
+declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8*) nounwind readonly
+
+
+define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2r(i16* %A, i16** %ptr) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_ld2r:
+;CHECK: ld2r.8h { v0, v1 }, [x0], #4
+ %ld2 = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 2
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16> } %ld2
+}
+
+define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld2r(i16* %A, i16** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_ld2r:
+;CHECK: ld2r.8h { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16> } %ld2
+}
+
+declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16*) nounwind readonly
+
+
+define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2r(i16* %A, i16** %ptr) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_ld2r:
+;CHECK: ld2r.4h { v0, v1 }, [x0], #4
+ %ld2 = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 2
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16> } %ld2
+}
+
+define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld2r(i16* %A, i16** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_ld2r:
+;CHECK: ld2r.4h { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16> } %ld2
+}
+
+declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16*) nounwind readonly
+
+
+define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2r(i32* %A, i32** %ptr) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_ld2r:
+;CHECK: ld2r.4s { v0, v1 }, [x0], #8
+ %ld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 2
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32> } %ld2
+}
+
+define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld2r(i32* %A, i32** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_ld2r:
+;CHECK: ld2r.4s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32> } %ld2
+}
+
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32*) nounwind readonly
+
+define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2r(i32* %A, i32** %ptr) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_ld2r:
+;CHECK: ld2r.2s { v0, v1 }, [x0], #8
+ %ld2 = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 2
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32> } %ld2
+}
+
+define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld2r(i32* %A, i32** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_ld2r:
+;CHECK: ld2r.2s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32> } %ld2
+}
+
+declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32*) nounwind readonly
+
+
+define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2r(i64* %A, i64** %ptr) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_ld2r:
+;CHECK: ld2r.2d { v0, v1 }, [x0], #16
+ %ld2 = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 2
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64> } %ld2
+}
+
+define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld2r(i64* %A, i64** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_ld2r:
+;CHECK: ld2r.2d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64> } %ld2
+}
+
+declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64*) nounwind readonly
+
+define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld2r(i64* %A, i64** %ptr) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_ld2r:
+;CHECK: ld2r.1d { v0, v1 }, [x0], #16
+ %ld2 = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 2
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64> } %ld2
+}
+
+define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld2r(i64* %A, i64** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_ld2r:
+;CHECK: ld2r.1d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64> } %ld2
+}
+
+declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64*) nounwind readonly
+
+
+define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld2r(float* %A, float** %ptr) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_ld2r:
+;CHECK: ld2r.4s { v0, v1 }, [x0], #8
+ %ld2 = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2r.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 2
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float> } %ld2
+}
+
+define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld2r(float* %A, float** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_ld2r:
+;CHECK: ld2r.4s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2r.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float> } %ld2
+}
+
+declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2r.v4f32.p0f32(float*) nounwind readonly
+
+define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2r(float* %A, float** %ptr) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_ld2r:
+;CHECK: ld2r.2s { v0, v1 }, [x0], #8
+ %ld2 = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 2
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float> } %ld2
+}
+
+define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld2r(float* %A, float** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_ld2r:
+;CHECK: ld2r.2s { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float> } %ld2
+}
+
+declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float*) nounwind readonly
+
+
+define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld2r(double* %A, double** %ptr) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_ld2r:
+;CHECK: ld2r.2d { v0, v1 }, [x0], #16
+ %ld2 = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 2
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double> } %ld2
+}
+
+define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld2r(double* %A, double** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_ld2r:
+;CHECK: ld2r.2d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double> } %ld2
+}
+
+declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0f64(double*) nounwind readonly
+
+define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2r(double* %A, double** %ptr) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_ld2r:
+;CHECK: ld2r.1d { v0, v1 }, [x0], #16
+ %ld2 = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 2
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double> } %ld2
+}
+
+define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld2r(double* %A, double** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_ld2r:
+;CHECK: ld2r.1d { v0, v1 }, [x0], x{{[0-9]+}}
+ %ld2 = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double> } %ld2
+}
+
+declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0f64(double*) nounwind readonly
+
+
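+; ld3r post-increment tests: three-register load-and-replicate with immediate and register write-back.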
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld3r(i8* %A, i8** %ptr) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_ld3r:
+;CHECK: ld3r.16b { v0, v1, v2 }, [x0], #3
+ %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 3
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3
+}
+
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld3r(i8* %A, i8** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_ld3r:
+;CHECK: ld3r.16b { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8*) nounwind readonly
+
+
+define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld3r(i8* %A, i8** %ptr) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_ld3r:
+;CHECK: ld3r.8b { v0, v1, v2 }, [x0], #3
+ %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 3
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3
+}
+
+define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld3r(i8* %A, i8** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_ld3r:
+;CHECK: ld3r.8b { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3
+}
+
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8*) nounwind readonly
+
+
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld3r(i16* %A, i16** %ptr) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_ld3r:
+;CHECK: ld3r.8h { v0, v1, v2 }, [x0], #6
+ %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 3
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3
+}
+
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld3r(i16* %A, i16** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_ld3r:
+;CHECK: ld3r.8h { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3
+}
+
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16*) nounwind readonly
+
+
+define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld3r(i16* %A, i16** %ptr) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_ld3r:
+;CHECK: ld3r.4h { v0, v1, v2 }, [x0], #6
+ %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 3
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3
+}
+
+define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld3r(i16* %A, i16** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_ld3r:
+;CHECK: ld3r.4h { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3
+}
+
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16*) nounwind readonly
+
+
+define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld3r(i32* %A, i32** %ptr) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_ld3r:
+;CHECK: ld3r.4s { v0, v1, v2 }, [x0], #12
+ %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 3
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3
+}
+
+define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld3r(i32* %A, i32** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_ld3r:
+;CHECK: ld3r.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3
+}
+
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32*) nounwind readonly
+
+define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld3r(i32* %A, i32** %ptr) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_ld3r:
+;CHECK: ld3r.2s { v0, v1, v2 }, [x0], #12
+ %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 3
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3
+}
+
+define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld3r(i32* %A, i32** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_ld3r:
+;CHECK: ld3r.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3
+}
+
+declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32*) nounwind readonly
+
+
+define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld3r(i64* %A, i64** %ptr) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_ld3r:
+;CHECK: ld3r.2d { v0, v1, v2 }, [x0], #24
+ %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 3
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3
+}
+
+define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld3r(i64* %A, i64** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_ld3r:
+;CHECK: ld3r.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3
+}
+
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64*) nounwind readonly
+
+define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld3r(i64* %A, i64** %ptr) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_ld3r:
+;CHECK: ld3r.1d { v0, v1, v2 }, [x0], #24
+ %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 3
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3
+}
+
+define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld3r(i64* %A, i64** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_ld3r:
+;CHECK: ld3r.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3
+}
+
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64*) nounwind readonly
+
+
+define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld3r(float* %A, float** %ptr) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_ld3r:
+;CHECK: ld3r.4s { v0, v1, v2 }, [x0], #12
+ %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3r.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 3
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float> } %ld3
+}
+
+define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld3r(float* %A, float** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_ld3r:
+;CHECK: ld3r.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3r.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float> } %ld3
+}
+
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3r.v4f32.p0f32(float*) nounwind readonly
+
+define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld3r(float* %A, float** %ptr) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_ld3r:
+;CHECK: ld3r.2s { v0, v1, v2 }, [x0], #12
+ %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 3
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float> } %ld3
+}
+
+define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld3r(float* %A, float** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_ld3r:
+;CHECK: ld3r.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float> } %ld3
+}
+
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float*) nounwind readonly
+
+
+define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld3r(double* %A, double** %ptr) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_ld3r:
+;CHECK: ld3r.2d { v0, v1, v2 }, [x0], #24
+ %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 3
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double> } %ld3
+}
+
+define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld3r(double* %A, double** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_ld3r:
+;CHECK: ld3r.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double> } %ld3
+}
+
+declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0f64(double*) nounwind readonly
+
+define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld3r(double* %A, double** %ptr) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_ld3r:
+;CHECK: ld3r.1d { v0, v1, v2 }, [x0], #24
+ %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 3
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double> } %ld3
+}
+
+define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld3r(double* %A, double** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_ld3r:
+;CHECK: ld3r.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double> } %ld3
+}
+
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0f64(double*) nounwind readonly
+
+
+define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld4r(i8* %A, i8** %ptr) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_ld4r:
+;CHECK: ld4r.16b { v0, v1, v2, v3 }, [x0], #4
+ %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 4
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4
+}
+
+define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld4r(i8* %A, i8** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_ld4r:
+;CHECK: ld4r.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8*) nounwind readonly
+
+
+define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld4r(i8* %A, i8** %ptr) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_ld4r:
+;CHECK: ld4r.8b { v0, v1, v2, v3 }, [x0], #4
+ %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 4
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4
+}
+
+define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld4r(i8* %A, i8** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_ld4r:
+;CHECK: ld4r.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4
+}
+
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8*) nounwind readonly
+
+
+define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld4r(i16* %A, i16** %ptr) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_ld4r:
+;CHECK: ld4r.8h { v0, v1, v2, v3 }, [x0], #8
+ %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 4
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4
+}
+
+define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld4r(i16* %A, i16** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_ld4r:
+;CHECK: ld4r.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4
+}
+
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16*) nounwind readonly
+
+
+define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld4r(i16* %A, i16** %ptr) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_ld4r:
+;CHECK: ld4r.4h { v0, v1, v2, v3 }, [x0], #8
+ %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i32 4
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4
+}
+
+define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld4r(i16* %A, i16** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_ld4r:
+;CHECK: ld4r.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4
+}
+
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16*) nounwind readonly
+
+
+define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld4r(i32* %A, i32** %ptr) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_ld4r:
+;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], #16
+ %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 4
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4
+}
+
+define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld4r(i32* %A, i32** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_ld4r:
+;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4
+}
+
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32*) nounwind readonly
+
+define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld4r(i32* %A, i32** %ptr) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_ld4r:
+;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], #16
+ %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i32 4
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4
+}
+
+define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld4r(i32* %A, i32** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_ld4r:
+;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4
+}
+
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32*) nounwind readonly
+
+
+define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld4r(i64* %A, i64** %ptr) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_ld4r:
+;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], #32
+ %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 4
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4
+}
+
+define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld4r(i64* %A, i64** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_ld4r:
+;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4
+}
+
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64*) nounwind readonly
+
+define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld4r(i64* %A, i64** %ptr) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_ld4r:
+;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], #32
+ %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i32 4
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4
+}
+
+define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld4r(i64* %A, i64** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_ld4r:
+;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4
+}
+
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64*) nounwind readonly
+
+
+define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld4r(float* %A, float** %ptr) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_ld4r:
+;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], #16
+ %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4r.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 4
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4
+}
+
+define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld4r(float* %A, float** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_ld4r:
+;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4r.v4f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4
+}
+
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4r.v4f32.p0f32(float*) nounwind readonly
+
+define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld4r(float* %A, float** %ptr) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_ld4r:
+;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], #16
+ %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i32 4
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4
+}
+
+define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld4r(float* %A, float** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_ld4r:
+;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4
+}
+
+declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float*) nounwind readonly
+
+
+define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld4r(double* %A, double** %ptr) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_ld4r:
+;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], #32
+ %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 4
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4
+}
+
+define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld4r(double* %A, double** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_ld4r:
+;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4
+}
+
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0f64(double*) nounwind readonly
+
+define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld4r(double* %A, double** %ptr) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_ld4r:
+;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], #32
+ %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i32 4
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4
+}
+
+define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld4r(double* %A, double** %ptr, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_ld4r:
+;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0f64(double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4
+}
+
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0f64(double*) nounwind readonly
+
+
+define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_ld2lane:
+;CHECK: ld2.b { v0, v1 }[0], [x0], #2
+ %ld2 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 2
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8> } %ld2
+}
+
+define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld2lane(i8* %A, i8** %ptr, i64 %inc, <16 x i8> %B, <16 x i8> %C) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_ld2lane:
+;CHECK: ld2.b { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8> } %ld2
+}
+
+declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+
+
+define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld2lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_ld2lane:
+;CHECK: ld2.b { v0, v1 }[0], [x0], #2
+ %ld2 = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 2
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8> } %ld2
+}
+
+define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld2lane(i8* %A, i8** %ptr, i64 %inc, <8 x i8> %B, <8 x i8> %C) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_ld2lane:
+;CHECK: ld2.b { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8> } %ld2
+}
+
+declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8>, <8 x i8>, i64, i8*) nounwind readonly
+
+
+define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_ld2lane:
+;CHECK: ld2.h { v0, v1 }[0], [x0], #4
+ %ld2 = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 2
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16> } %ld2
+}
+
+define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld2lane(i16* %A, i16** %ptr, i64 %inc, <8 x i16> %B, <8 x i16> %C) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_ld2lane:
+;CHECK: ld2.h { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16> } %ld2
+}
+
+declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+
+
+define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_ld2lane:
+;CHECK: ld2.h { v0, v1 }[0], [x0], #4
+ %ld2 = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 2
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16> } %ld2
+}
+
+define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld2lane(i16* %A, i16** %ptr, i64 %inc, <4 x i16> %B, <4 x i16> %C) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_ld2lane:
+;CHECK: ld2.h { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16> } %ld2
+}
+
+declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i16(<4 x i16>, <4 x i16>, i64, i16*) nounwind readonly
+
+
+define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_ld2lane:
+;CHECK: ld2.s { v0, v1 }[0], [x0], #8
+ %ld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 2
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32> } %ld2
+}
+
+define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld2lane(i32* %A, i32** %ptr, i64 %inc, <4 x i32> %B, <4 x i32> %C) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_ld2lane:
+;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32> } %ld2
+}
+
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+
+
+define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_ld2lane:
+;CHECK: ld2.s { v0, v1 }[0], [x0], #8
+ %ld2 = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 2
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32> } %ld2
+}
+
+define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld2lane(i32* %A, i32** %ptr, i64 %inc, <2 x i32> %B, <2 x i32> %C) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_ld2lane:
+;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32> } %ld2
+}
+
+declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i32(<2 x i32>, <2 x i32>, i64, i32*) nounwind readonly
+
+
+define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_ld2lane:
+;CHECK: ld2.d { v0, v1 }[0], [x0], #16
+ %ld2 = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i32 2
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64> } %ld2
+}
+
+define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld2lane(i64* %A, i64** %ptr, i64 %inc, <2 x i64> %B, <2 x i64> %C) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_ld2lane:
+;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64> } %ld2
+}
+
+declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+
+
+define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld2lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_ld2lane:
+;CHECK: ld2.d { v0, v1 }[0], [x0], #16
+ %ld2 = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i32 2
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64> } %ld2
+}
+
+define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld2lane(i64* %A, i64** %ptr, i64 %inc, <1 x i64> %B, <1 x i64> %C) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_ld2lane:
+;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64> } %ld2
+}
+
+declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*) nounwind readonly
+
+
+define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld2lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_ld2lane:
+;CHECK: ld2.s { v0, v1 }[0], [x0], #8
+ %ld2 = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 2
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float> } %ld2
+}
+
+define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld2lane(float* %A, float** %ptr, i64 %inc, <4 x float> %B, <4 x float> %C) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_ld2lane:
+;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float> } %ld2
+}
+
+declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*) nounwind readonly
+
+
+define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_ld2lane:
+;CHECK: ld2.s { v0, v1 }[0], [x0], #8
+ %ld2 = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 2
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float> } %ld2
+}
+
+define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld2lane(float* %A, float** %ptr, i64 %inc, <2 x float> %B, <2 x float> %C) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_ld2lane:
+;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float> } %ld2
+}
+
+declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*) nounwind readonly
+
+
+define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld2lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_ld2lane:
+;CHECK: ld2.d { v0, v1 }[0], [x0], #16
+ %ld2 = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i32 2
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double> } %ld2
+}
+
+define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld2lane(double* %A, double** %ptr, i64 %inc, <2 x double> %B, <2 x double> %C) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_ld2lane:
+;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double> } %ld2
+}
+
+declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0f64(<2 x double>, <2 x double>, i64, double*) nounwind readonly
+
+
+define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_ld2lane:
+;CHECK: ld2.d { v0, v1 }[0], [x0], #16
+ %ld2 = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i32 2
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double> } %ld2
+}
+
+define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld2lane(double* %A, double** %ptr, i64 %inc, <1 x double> %B, <1 x double> %C) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_ld2lane:
+;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
+ %ld2 = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double> } %ld2
+}
+
+declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0f64(<1 x double>, <1 x double>, i64, double*) nounwind readonly
+
+
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld3lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_ld3lane:
+;CHECK: ld3.b { v0, v1, v2 }[0], [x0], #3
+ %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 3
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3
+}
+
+define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld3lane(i8* %A, i8** %ptr, i64 %inc, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_ld3lane:
+;CHECK: ld3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+
+
+define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld3lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_ld3lane:
+;CHECK: ld3.b { v0, v1, v2 }[0], [x0], #3
+ %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 3
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3
+}
+
+define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld3lane(i8* %A, i8** %ptr, i64 %inc, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_ld3lane:
+;CHECK: ld3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3
+}
+
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) nounwind readonly
+
+
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld3lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_ld3lane:
+;CHECK: ld3.h { v0, v1, v2 }[0], [x0], #6
+ %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 3
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3
+}
+
+define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld3lane(i16* %A, i16** %ptr, i64 %inc, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_ld3lane:
+;CHECK: ld3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3
+}
+
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+
+
+define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld3lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_ld3lane:
+;CHECK: ld3.h { v0, v1, v2 }[0], [x0], #6
+ %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 3
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3
+}
+
+define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld3lane(i16* %A, i16** %ptr, i64 %inc, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_ld3lane:
+;CHECK: ld3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3
+}
+
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) nounwind readonly
+
+
+define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld3lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_ld3lane:
+;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12
+ %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 3
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3
+}
+
+define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld3lane(i32* %A, i32** %ptr, i64 %inc, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_ld3lane:
+;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3
+}
+
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+
+
+define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld3lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_ld3lane:
+;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12
+ %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 3
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3
+}
+
+define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld3lane(i32* %A, i32** %ptr, i64 %inc, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_ld3lane:
+;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3
+}
+
+declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i64, i32*) nounwind readonly
+
+
+define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld3lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_ld3lane:
+;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24
+ %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i32 3
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3
+}
+
+define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld3lane(i64* %A, i64** %ptr, i64 %inc, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_ld3lane:
+;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3
+}
+
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+
+
+define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld3lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_ld3lane:
+;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24
+ %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i32 3
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3
+}
+
+define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld3lane(i64* %A, i64** %ptr, i64 %inc, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_ld3lane:
+;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3
+}
+
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) nounwind readonly
+
+
+define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld3lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_ld3lane:
+;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12
+ %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 3
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float> } %ld3
+}
+
+define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld3lane(float* %A, float** %ptr, i64 %inc, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_ld3lane:
+;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float> } %ld3
+}
+
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*) nounwind readonly
+
+
+define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld3lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_ld3lane:
+;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12
+ %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 3
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float> } %ld3
+}
+
+define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld3lane(float* %A, float** %ptr, i64 %inc, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_ld3lane:
+;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float> } %ld3
+}
+
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*) nounwind readonly
+
+
+define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld3lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_ld3lane:
+;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24
+ %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i32 3
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double> } %ld3
+}
+
+define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld3lane(double* %A, double** %ptr, i64 %inc, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_ld3lane:
+;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double> } %ld3
+}
+
+declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, i64, double*) nounwind readonly
+
+
+define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld3lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_ld3lane:
+;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24
+ %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i32 3
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double> } %ld3
+}
+
+define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld3lane(double* %A, double** %ptr, i64 %inc, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_ld3lane:
+;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double> } %ld3
+}
+
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, i64, double*) nounwind readonly
+
+
+define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld4lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_ld4lane:
+;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], #4
+ %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 4
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4
+}
+
+define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld4lane(i8* %A, i8** %ptr, i64 %inc, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_ld4lane:
+;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+
+
+define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld4lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_ld4lane:
+;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], #4
+ %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 4
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4
+}
+
+define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld4lane(i8* %A, i8** %ptr, i64 %inc, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_ld4lane:
+;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ store i8* %tmp, i8** %ptr
+ ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4
+}
+
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) nounwind readonly
+
+
+define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld4lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_ld4lane:
+;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], #8
+ %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 4
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4
+}
+
+define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld4lane(i16* %A, i16** %ptr, i64 %inc, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_ld4lane:
+;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4
+}
+
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+
+
+define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld4lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_ld4lane:
+;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], #8
+ %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 4
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4
+}
+
+define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld4lane(i16* %A, i16** %ptr, i64 %inc, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_ld4lane:
+;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ store i16* %tmp, i16** %ptr
+ ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4
+}
+
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) nounwind readonly
+
+
+define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld4lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_ld4lane:
+;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16
+ %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 4
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4
+}
+
+define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld4lane(i32* %A, i32** %ptr, i64 %inc, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_ld4lane:
+;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4
+}
+
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+
+
+define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld4lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_ld4lane:
+;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16
+ %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 4
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4
+}
+
+define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld4lane(i32* %A, i32** %ptr, i64 %inc, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_ld4lane:
+;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ store i32* %tmp, i32** %ptr
+ ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4
+}
+
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i64, i32*) nounwind readonly
+
+
+define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld4lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_ld4lane:
+;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32
+ %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i32 4
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4
+}
+
+define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld4lane(i64* %A, i64** %ptr, i64 %inc, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_ld4lane:
+;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4
+}
+
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+
+
+define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld4lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_ld4lane:
+;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32
+ %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i32 4
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4
+}
+
+define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld4lane(i64* %A, i64** %ptr, i64 %inc, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_ld4lane:
+;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ store i64* %tmp, i64** %ptr
+ ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4
+}
+
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) nounwind readonly
+
+
+define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld4lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_ld4lane:
+;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16
+ %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 4
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4
+}
+
+define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld4lane(float* %A, float** %ptr, i64 %inc, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_ld4lane:
+;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4
+}
+
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*) nounwind readonly
+
+
+define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld4lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_ld4lane:
+;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16
+ %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 4
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4
+}
+
+define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld4lane(float* %A, float** %ptr, i64 %inc, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_ld4lane:
+;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ store float* %tmp, float** %ptr
+ ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4
+}
+
+declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, i64, float*) nounwind readonly
+
+
+define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld4lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_ld4lane:
+;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32
+ %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i32 4
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4
+}
+
+define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld4lane(double* %A, double** %ptr, i64 %inc, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_ld4lane:
+;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4
+}
+
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, i64, double*) nounwind readonly
+
+
+define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld4lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_ld4lane:
+;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32
+ %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i32 4
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4
+}
+
+define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld4lane(double* %A, double** %ptr, i64 %inc, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_ld4lane:
+;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ store double* %tmp, double** %ptr
+ ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4
+}
+
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, i64, double*) nounwind readonly
+
+
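+; st2 post-increment stores: each immediate test advances the base pointer by the total
+; size of the two vectors stored, which is expected to fold into the post-indexed form of
+; st2 (a two-register st1.1d for the <1 x i64>/<1 x double> cases); each register test
+; advances by %inc elements and is expected to use a register post-index.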
+define i8* @test_v16i8_post_imm_st2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_st2:
+;CHECK: st2.16b { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A)
+ %tmp = getelementptr i8* %A, i32 32
+ ret i8* %tmp
+}
+
+define i8* @test_v16i8_post_reg_st2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_st2:
+;CHECK: st2.16b { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*)
+
+
+define i8* @test_v8i8_post_imm_st2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_st2:
+;CHECK: st2.8b { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A)
+ %tmp = getelementptr i8* %A, i32 16
+ ret i8* %tmp
+}
+
+define i8* @test_v8i8_post_reg_st2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_st2:
+;CHECK: st2.8b { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*)
+
+
+define i16* @test_v8i16_post_imm_st2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_st2:
+;CHECK: st2.8h { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A)
+ %tmp = getelementptr i16* %A, i32 16
+ ret i16* %tmp
+}
+
+define i16* @test_v8i16_post_reg_st2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_st2:
+;CHECK: st2.8h { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*)
+
+
+define i16* @test_v4i16_post_imm_st2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_st2:
+;CHECK: st2.4h { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A)
+ %tmp = getelementptr i16* %A, i32 8
+ ret i16* %tmp
+}
+
+define i16* @test_v4i16_post_reg_st2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_st2:
+;CHECK: st2.4h { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*)
+
+
+define i32* @test_v4i32_post_imm_st2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_st2:
+;CHECK: st2.4s { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A)
+ %tmp = getelementptr i32* %A, i32 8
+ ret i32* %tmp
+}
+
+define i32* @test_v4i32_post_reg_st2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_st2:
+;CHECK: st2.4s { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*)
+
+
+define i32* @test_v2i32_post_imm_st2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_st2:
+;CHECK: st2.2s { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A)
+ %tmp = getelementptr i32* %A, i32 4
+ ret i32* %tmp
+}
+
+define i32* @test_v2i32_post_reg_st2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_st2:
+;CHECK: st2.2s { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*)
+
+
+define i64* @test_v2i64_post_imm_st2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_st2:
+;CHECK: st2.2d { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A)
+ %tmp = getelementptr i64* %A, i64 4
+ ret i64* %tmp
+}
+
+define i64* @test_v2i64_post_reg_st2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_st2:
+;CHECK: st2.2d { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*)
+
+
+define i64* @test_v1i64_post_imm_st2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_st2:
+;CHECK: st1.1d { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A)
+ %tmp = getelementptr i64* %A, i64 2
+ ret i64* %tmp
+}
+
+define i64* @test_v1i64_post_reg_st2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_st2:
+;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*)
+
+
+define float* @test_v4f32_post_imm_st2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_st2:
+;CHECK: st2.4s { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A)
+ %tmp = getelementptr float* %A, i32 8
+ ret float* %tmp
+}
+
+define float* @test_v4f32_post_reg_st2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_st2:
+;CHECK: st2.4s { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float>, <4 x float>, float*)
+
+
+define float* @test_v2f32_post_imm_st2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_st2:
+;CHECK: st2.2s { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A)
+ %tmp = getelementptr float* %A, i32 4
+ ret float* %tmp
+}
+
+define float* @test_v2f32_post_reg_st2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_st2:
+;CHECK: st2.2s { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v2f32.p0f32(<2 x float>, <2 x float>, float*)
+
+
+define double* @test_v2f64_post_imm_st2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_st2:
+;CHECK: st2.2d { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A)
+ %tmp = getelementptr double* %A, i64 4
+ ret double* %tmp
+}
+
+define double* @test_v2f64_post_reg_st2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_st2:
+;CHECK: st2.2d { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v2f64.p0f64(<2 x double>, <2 x double>, double*)
+
+
+define double* @test_v1f64_post_imm_st2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_st2:
+;CHECK: st1.1d { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A)
+ %tmp = getelementptr double* %A, i64 2
+ ret double* %tmp
+}
+
+define double* @test_v1f64_post_reg_st2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_st2:
+;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2.v1f64.p0f64(<1 x double>, <1 x double>, double*)
+
+
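+; st3 post-increment stores: same pattern as above with three source vectors, so the
+; immediate offsets are 3x the vector size and the <1 x ...> cases lower to a
+; three-register st1.1d.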
+define i8* @test_v16i8_post_imm_st3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_st3:
+;CHECK: st3.16b { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A)
+ %tmp = getelementptr i8* %A, i32 48
+ ret i8* %tmp
+}
+
+define i8* @test_v16i8_post_reg_st3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_st3:
+;CHECK: st3.16b { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*)
+
+
+define i8* @test_v8i8_post_imm_st3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_st3:
+;CHECK: st3.8b { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A)
+ %tmp = getelementptr i8* %A, i32 24
+ ret i8* %tmp
+}
+
+define i8* @test_v8i8_post_reg_st3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_st3:
+;CHECK: st3.8b { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*)
+
+
+define i16* @test_v8i16_post_imm_st3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_st3:
+;CHECK: st3.8h { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A)
+ %tmp = getelementptr i16* %A, i32 24
+ ret i16* %tmp
+}
+
+define i16* @test_v8i16_post_reg_st3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_st3:
+;CHECK: st3.8h { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*)
+
+
+define i16* @test_v4i16_post_imm_st3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_st3:
+;CHECK: st3.4h { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A)
+ %tmp = getelementptr i16* %A, i32 12
+ ret i16* %tmp
+}
+
+define i16* @test_v4i16_post_reg_st3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_st3:
+;CHECK: st3.4h { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*)
+
+
+define i32* @test_v4i32_post_imm_st3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_st3:
+;CHECK: st3.4s { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A)
+ %tmp = getelementptr i32* %A, i32 12
+ ret i32* %tmp
+}
+
+define i32* @test_v4i32_post_reg_st3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_st3:
+;CHECK: st3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*)
+
+
+define i32* @test_v2i32_post_imm_st3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_st3:
+;CHECK: st3.2s { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A)
+ %tmp = getelementptr i32* %A, i32 6
+ ret i32* %tmp
+}
+
+define i32* @test_v2i32_post_reg_st3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_st3:
+;CHECK: st3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*)
+
+
+define i64* @test_v2i64_post_imm_st3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_st3:
+;CHECK: st3.2d { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A)
+ %tmp = getelementptr i64* %A, i64 6
+ ret i64* %tmp
+}
+
+define i64* @test_v2i64_post_reg_st3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_st3:
+;CHECK: st3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*)
+
+
+define i64* @test_v1i64_post_imm_st3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_st3:
+;CHECK: st1.1d { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A)
+ %tmp = getelementptr i64* %A, i64 3
+ ret i64* %tmp
+}
+
+define i64* @test_v1i64_post_reg_st3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_st3:
+;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*)
+
+
+define float* @test_v4f32_post_imm_st3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_st3:
+;CHECK: st3.4s { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A)
+ %tmp = getelementptr float* %A, i32 12
+ ret float* %tmp
+}
+
+define float* @test_v4f32_post_reg_st3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_st3:
+;CHECK: st3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*)
+
+
+define float* @test_v2f32_post_imm_st3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_st3:
+;CHECK: st3.2s { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A)
+ %tmp = getelementptr float* %A, i32 6
+ ret float* %tmp
+}
+
+define float* @test_v2f32_post_reg_st3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_st3:
+;CHECK: st3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*)
+
+
+define double* @test_v2f64_post_imm_st3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_st3:
+;CHECK: st3.2d { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A)
+ %tmp = getelementptr double* %A, i64 6
+ ret double* %tmp
+}
+
+define double* @test_v2f64_post_reg_st3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_st3:
+;CHECK: st3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*)
+
+
+define double* @test_v1f64_post_imm_st3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_st3:
+;CHECK: st1.1d { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A)
+ %tmp = getelementptr double* %A, i64 3
+ ret double* %tmp
+}
+
+define double* @test_v1f64_post_reg_st3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_st3:
+;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*)
+
+
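+; st4 post-increment stores: four source vectors; immediate offsets are 4x the vector
+; size and the <1 x ...> cases lower to a four-register st1.1d.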
+define i8* @test_v16i8_post_imm_st4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_st4:
+;CHECK: st4.16b { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A)
+ %tmp = getelementptr i8* %A, i32 64
+ ret i8* %tmp
+}
+
+define i8* @test_v16i8_post_reg_st4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_st4:
+;CHECK: st4.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*)
+
+
+define i8* @test_v8i8_post_imm_st4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_st4:
+;CHECK: st4.8b { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A)
+ %tmp = getelementptr i8* %A, i32 32
+ ret i8* %tmp
+}
+
+define i8* @test_v8i8_post_reg_st4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_st4:
+;CHECK: st4.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*)
+
+
+define i16* @test_v8i16_post_imm_st4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_st4:
+;CHECK: st4.8h { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A)
+ %tmp = getelementptr i16* %A, i32 32
+ ret i16* %tmp
+}
+
+define i16* @test_v8i16_post_reg_st4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_st4:
+;CHECK: st4.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*)
+
+
+define i16* @test_v4i16_post_imm_st4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_st4:
+;CHECK: st4.4h { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A)
+ %tmp = getelementptr i16* %A, i32 16
+ ret i16* %tmp
+}
+
+define i16* @test_v4i16_post_reg_st4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_st4:
+;CHECK: st4.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*)
+
+
+define i32* @test_v4i32_post_imm_st4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_st4:
+;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A)
+ %tmp = getelementptr i32* %A, i32 16
+ ret i32* %tmp
+}
+
+define i32* @test_v4i32_post_reg_st4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_st4:
+;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*)
+
+
+define i32* @test_v2i32_post_imm_st4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_st4:
+;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A)
+ %tmp = getelementptr i32* %A, i32 8
+ ret i32* %tmp
+}
+
+define i32* @test_v2i32_post_reg_st4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_st4:
+;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*)
+
+
+define i64* @test_v2i64_post_imm_st4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_st4:
+;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A)
+ %tmp = getelementptr i64* %A, i64 8
+ ret i64* %tmp
+}
+
+define i64* @test_v2i64_post_reg_st4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_st4:
+;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*)
+
+
+define i64* @test_v1i64_post_imm_st4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_st4:
+;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A)
+ %tmp = getelementptr i64* %A, i64 4
+ ret i64* %tmp
+}
+
+define i64* @test_v1i64_post_reg_st4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_st4:
+;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*)
+
+
+define float* @test_v4f32_post_imm_st4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_st4:
+;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A)
+ %tmp = getelementptr float* %A, i32 16
+ ret float* %tmp
+}
+
+define float* @test_v4f32_post_reg_st4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_st4:
+;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*)
+
+
+define float* @test_v2f32_post_imm_st4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_st4:
+;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A)
+ %tmp = getelementptr float* %A, i32 8
+ ret float* %tmp
+}
+
+define float* @test_v2f32_post_reg_st4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_st4:
+;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*)
+
+
+define double* @test_v2f64_post_imm_st4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_st4:
+;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A)
+ %tmp = getelementptr double* %A, i64 8
+ ret double* %tmp
+}
+
+define double* @test_v2f64_post_reg_st4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_st4:
+;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, double*)
+
+
+define double* @test_v1f64_post_imm_st4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_st4:
+;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A)
+ %tmp = getelementptr double* %A, i64 4
+ ret double* %tmp
+}
+
+define double* @test_v1f64_post_reg_st4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_st4:
+;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*)
+
+
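+; st1x2 post-increment stores: two vectors written contiguously (no interleaving),
+; checked to emit a two-register st1 with an immediate or register post-index.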
+define i8* @test_v16i8_post_imm_st1x2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_st1x2:
+;CHECK: st1.16b { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A)
+ %tmp = getelementptr i8* %A, i32 32
+ ret i8* %tmp
+}
+
+define i8* @test_v16i8_post_reg_st1x2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_st1x2:
+;CHECK: st1.16b { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*)
+
+
+define i8* @test_v8i8_post_imm_st1x2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_st1x2:
+;CHECK: st1.8b { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A)
+ %tmp = getelementptr i8* %A, i32 16
+ ret i8* %tmp
+}
+
+define i8* @test_v8i8_post_reg_st1x2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_st1x2:
+;CHECK: st1.8b { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*)
+
+
+define i16* @test_v8i16_post_imm_st1x2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_st1x2:
+;CHECK: st1.8h { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A)
+ %tmp = getelementptr i16* %A, i32 16
+ ret i16* %tmp
+}
+
+define i16* @test_v8i16_post_reg_st1x2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_st1x2:
+;CHECK: st1.8h { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*)
+
+
+define i16* @test_v4i16_post_imm_st1x2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_st1x2:
+;CHECK: st1.4h { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A)
+ %tmp = getelementptr i16* %A, i32 8
+ ret i16* %tmp
+}
+
+define i16* @test_v4i16_post_reg_st1x2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_st1x2:
+;CHECK: st1.4h { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*)
+
+
+define i32* @test_v4i32_post_imm_st1x2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_st1x2:
+;CHECK: st1.4s { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A)
+ %tmp = getelementptr i32* %A, i32 8
+ ret i32* %tmp
+}
+
+define i32* @test_v4i32_post_reg_st1x2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_st1x2:
+;CHECK: st1.4s { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*)
+
+
+define i32* @test_v2i32_post_imm_st1x2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_st1x2:
+;CHECK: st1.2s { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A)
+ %tmp = getelementptr i32* %A, i32 4
+ ret i32* %tmp
+}
+
+define i32* @test_v2i32_post_reg_st1x2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_st1x2:
+;CHECK: st1.2s { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*)
+
+
+define i64* @test_v2i64_post_imm_st1x2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_st1x2:
+;CHECK: st1.2d { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A)
+ %tmp = getelementptr i64* %A, i64 4
+ ret i64* %tmp
+}
+
+define i64* @test_v2i64_post_reg_st1x2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_st1x2:
+;CHECK: st1.2d { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*)
+
+
+define i64* @test_v1i64_post_imm_st1x2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_st1x2:
+;CHECK: st1.1d { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A)
+ %tmp = getelementptr i64* %A, i64 2
+ ret i64* %tmp
+}
+
+define i64* @test_v1i64_post_reg_st1x2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_st1x2:
+;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*)
+
+
+define float* @test_v4f32_post_imm_st1x2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_st1x2:
+;CHECK: st1.4s { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A)
+ %tmp = getelementptr float* %A, i32 8
+ ret float* %tmp
+}
+
+define float* @test_v4f32_post_reg_st1x2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_st1x2:
+;CHECK: st1.4s { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*)
+
+
+define float* @test_v2f32_post_imm_st1x2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_st1x2:
+;CHECK: st1.2s { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A)
+ %tmp = getelementptr float* %A, i32 4
+ ret float* %tmp
+}
+
+define float* @test_v2f32_post_reg_st1x2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_st1x2:
+;CHECK: st1.2s { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float>, <2 x float>, float*)
+
+
+define double* @test_v2f64_post_imm_st1x2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_st1x2:
+;CHECK: st1.2d { v0, v1 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A)
+ %tmp = getelementptr double* %A, i64 4
+ ret double* %tmp
+}
+
+define double* @test_v2f64_post_reg_st1x2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_st1x2:
+;CHECK: st1.2d { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double>, <2 x double>, double*)
+
+
+define double* @test_v1f64_post_imm_st1x2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_st1x2:
+;CHECK: st1.1d { v0, v1 }, [x0], #16
+ call void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A)
+ %tmp = getelementptr double* %A, i64 2
+ ret double* %tmp
+}
+
+define double* @test_v1f64_post_reg_st1x2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_st1x2:
+;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double>, <1 x double>, double*)
+
+
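+; st1x3 post-increment stores: three contiguous vectors, checked to emit a
+; three-register st1 with an immediate or register post-index.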
+define i8* @test_v16i8_post_imm_st1x3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_st1x3:
+;CHECK: st1.16b { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A)
+ %tmp = getelementptr i8* %A, i32 48
+ ret i8* %tmp
+}
+
+define i8* @test_v16i8_post_reg_st1x3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_st1x3:
+;CHECK: st1.16b { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*)
+
+
+define i8* @test_v8i8_post_imm_st1x3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_st1x3:
+;CHECK: st1.8b { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A)
+ %tmp = getelementptr i8* %A, i32 24
+ ret i8* %tmp
+}
+
+define i8* @test_v8i8_post_reg_st1x3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_st1x3:
+;CHECK: st1.8b { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*)
+
+
+define i16* @test_v8i16_post_imm_st1x3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_st1x3:
+;CHECK: st1.8h { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A)
+ %tmp = getelementptr i16* %A, i32 24
+ ret i16* %tmp
+}
+
+define i16* @test_v8i16_post_reg_st1x3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_st1x3:
+;CHECK: st1.8h { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*)
+
+
+define i16* @test_v4i16_post_imm_st1x3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_st1x3:
+;CHECK: st1.4h { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A)
+ %tmp = getelementptr i16* %A, i32 12
+ ret i16* %tmp
+}
+
+define i16* @test_v4i16_post_reg_st1x3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_st1x3:
+;CHECK: st1.4h { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*)
+
+
+define i32* @test_v4i32_post_imm_st1x3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_st1x3:
+;CHECK: st1.4s { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A)
+ %tmp = getelementptr i32* %A, i32 12
+ ret i32* %tmp
+}
+
+define i32* @test_v4i32_post_reg_st1x3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_st1x3:
+;CHECK: st1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*)
+
+
+define i32* @test_v2i32_post_imm_st1x3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_st1x3:
+;CHECK: st1.2s { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A)
+ %tmp = getelementptr i32* %A, i32 6
+ ret i32* %tmp
+}
+
+define i32* @test_v2i32_post_reg_st1x3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_st1x3:
+;CHECK: st1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*)
+
+
+define i64* @test_v2i64_post_imm_st1x3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_st1x3:
+;CHECK: st1.2d { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A)
+ %tmp = getelementptr i64* %A, i64 6
+ ret i64* %tmp
+}
+
+define i64* @test_v2i64_post_reg_st1x3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_st1x3:
+;CHECK: st1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*)
+
+
+define i64* @test_v1i64_post_imm_st1x3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_st1x3:
+;CHECK: st1.1d { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A)
+ %tmp = getelementptr i64* %A, i64 3
+ ret i64* %tmp
+}
+
+define i64* @test_v1i64_post_reg_st1x3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_st1x3:
+;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*)
+
+
+define float* @test_v4f32_post_imm_st1x3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_st1x3:
+;CHECK: st1.4s { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A)
+ %tmp = getelementptr float* %A, i32 12
+ ret float* %tmp
+}
+
+define float* @test_v4f32_post_reg_st1x3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_st1x3:
+;CHECK: st1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*)
+
+
+define float* @test_v2f32_post_imm_st1x3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_st1x3:
+;CHECK: st1.2s { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A)
+ %tmp = getelementptr float* %A, i32 6
+ ret float* %tmp
+}
+
+define float* @test_v2f32_post_reg_st1x3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_st1x3:
+;CHECK: st1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*)
+
+
+define double* @test_v2f64_post_imm_st1x3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_st1x3:
+;CHECK: st1.2d { v0, v1, v2 }, [x0], #48
+ call void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A)
+ %tmp = getelementptr double* %A, i64 6
+ ret double* %tmp
+}
+
+define double* @test_v2f64_post_reg_st1x3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_st1x3:
+;CHECK: st1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*)
+
+
+define double* @test_v1f64_post_imm_st1x3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_st1x3:
+;CHECK: st1.1d { v0, v1, v2 }, [x0], #24
+ call void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A)
+ %tmp = getelementptr double* %A, i64 3
+ ret double* %tmp
+}
+
+define double* @test_v1f64_post_reg_st1x3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_st1x3:
+;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*)
+
+
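+; st1x4 post-increment stores: four contiguous vectors, checked to emit a
+; four-register st1 with an immediate or register post-index.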
+define i8* @test_v16i8_post_imm_st1x4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_st1x4:
+;CHECK: st1.16b { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A)
+ %tmp = getelementptr i8* %A, i32 64
+ ret i8* %tmp
+}
+
+define i8* @test_v16i8_post_reg_st1x4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_st1x4:
+;CHECK: st1.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*)
+
+
+define i8* @test_v8i8_post_imm_st1x4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_st1x4:
+;CHECK: st1.8b { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A)
+ %tmp = getelementptr i8* %A, i32 32
+ ret i8* %tmp
+}
+
+define i8* @test_v8i8_post_reg_st1x4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_st1x4:
+;CHECK: st1.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*)
+
+
+define i16* @test_v8i16_post_imm_st1x4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_st1x4:
+;CHECK: st1.8h { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A)
+ %tmp = getelementptr i16* %A, i32 32
+ ret i16* %tmp
+}
+
+define i16* @test_v8i16_post_reg_st1x4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_st1x4:
+;CHECK: st1.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*)
+
+
+define i16* @test_v4i16_post_imm_st1x4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_st1x4:
+;CHECK: st1.4h { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A)
+ %tmp = getelementptr i16* %A, i32 16
+ ret i16* %tmp
+}
+
+define i16* @test_v4i16_post_reg_st1x4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_st1x4:
+;CHECK: st1.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*)
+
+
+define i32* @test_v4i32_post_imm_st1x4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_st1x4:
+;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A)
+ %tmp = getelementptr i32* %A, i32 16
+ ret i32* %tmp
+}
+
+define i32* @test_v4i32_post_reg_st1x4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_st1x4:
+;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*)
+
+
+define i32* @test_v2i32_post_imm_st1x4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_st1x4:
+;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A)
+ %tmp = getelementptr i32* %A, i32 8
+ ret i32* %tmp
+}
+
+define i32* @test_v2i32_post_reg_st1x4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_st1x4:
+;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*)
+
+
+define i64* @test_v2i64_post_imm_st1x4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_st1x4:
+;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A)
+ %tmp = getelementptr i64* %A, i64 8
+ ret i64* %tmp
+}
+
+define i64* @test_v2i64_post_reg_st1x4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_st1x4:
+;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*)
+
+
+define i64* @test_v1i64_post_imm_st1x4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_st1x4:
+;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A)
+ %tmp = getelementptr i64* %A, i64 4
+ ret i64* %tmp
+}
+
+define i64* @test_v1i64_post_reg_st1x4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_st1x4:
+;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*)
+
+
+define float* @test_v4f32_post_imm_st1x4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_st1x4:
+;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A)
+ %tmp = getelementptr float* %A, i32 16
+ ret float* %tmp
+}
+
+define float* @test_v4f32_post_reg_st1x4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_st1x4:
+;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*)
+
+
+define float* @test_v2f32_post_imm_st1x4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_st1x4:
+;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A)
+ %tmp = getelementptr float* %A, i32 8
+ ret float* %tmp
+}
+
+define float* @test_v2f32_post_reg_st1x4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_st1x4:
+;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*)
+
+
+define double* @test_v2f64_post_imm_st1x4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_st1x4:
+;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], #64
+ call void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A)
+ %tmp = getelementptr double* %A, i64 8
+ ret double* %tmp
+}
+
+define double* @test_v2f64_post_reg_st1x4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_st1x4:
+;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, double*)
+
+
+define double* @test_v1f64_post_imm_st1x4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_st1x4:
+;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32
+ call void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A)
+ %tmp = getelementptr double* %A, i64 4
+ ret double* %tmp
+}
+
+define double* @test_v1f64_post_reg_st1x4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_st1x4:
+;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*)
+
+
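+; The two st2lanelane tests below carry no CHECK lines; they exercise the
+; intrinsic call with immediate and register post-increments without
+; checking the emitted instruction sequence.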
+define i8* @test_v16i8_post_imm_st2lanelane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) {
+ call void @llvm.aarch64.neon.st2lanelane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i64 1, i8* %A)
+ %tmp = getelementptr i8* %A, i32 2
+ ret i8* %tmp
+}
+
+define i8* @test_v16i8_post_reg_st2lanelane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) {
+ call void @llvm.aarch64.neon.st2lanelane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i64 1, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lanelane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i64, i8*) nounwind readnone
+
+
+define i8* @test_v16i8_post_imm_st2lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_st2lane:
+;CHECK: st2.b { v0, v1 }[0], [x0], #2
+ call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 2
+ ret i8* %tmp
+}
+
+define i8* @test_v16i8_post_reg_st2lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_st2lane:
+;CHECK: st2.b { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*)
+
+
+define i8* @test_v8i8_post_imm_st2lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_st2lane:
+;CHECK: st2.b { v0, v1 }[0], [x0], #2
+ call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 2
+ ret i8* %tmp
+}
+
+define i8* @test_v8i8_post_reg_st2lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_st2lane:
+;CHECK: st2.b { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8>, <8 x i8>, i64, i8*)
+
+
+define i16* @test_v8i16_post_imm_st2lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_st2lane:
+;CHECK: st2.h { v0, v1 }[0], [x0], #4
+ call void @llvm.aarch64.neon.st2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 2
+ ret i16* %tmp
+}
+
+define i16* @test_v8i16_post_reg_st2lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_st2lane:
+;CHECK: st2.h { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*)
+
+
+define i16* @test_v4i16_post_imm_st2lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_st2lane:
+;CHECK: st2.h { v0, v1 }[0], [x0], #4
+ call void @llvm.aarch64.neon.st2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 2
+ ret i16* %tmp
+}
+
+define i16* @test_v4i16_post_reg_st2lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_st2lane:
+;CHECK: st2.h { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v4i16.p0i16(<4 x i16>, <4 x i16>, i64, i16*)
+
+
+define i32* @test_v4i32_post_imm_st2lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_st2lane:
+;CHECK: st2.s { v0, v1 }[0], [x0], #8
+ call void @llvm.aarch64.neon.st2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 2
+ ret i32* %tmp
+}
+
+define i32* @test_v4i32_post_reg_st2lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_st2lane:
+;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*)
+
+
+define i32* @test_v2i32_post_imm_st2lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_st2lane:
+;CHECK: st2.s { v0, v1 }[0], [x0], #8
+ call void @llvm.aarch64.neon.st2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 2
+ ret i32* %tmp
+}
+
+define i32* @test_v2i32_post_reg_st2lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_st2lane:
+;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v2i32.p0i32(<2 x i32>, <2 x i32>, i64, i32*)
+
+
+define i64* @test_v2i64_post_imm_st2lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_st2lane:
+;CHECK: st2.d { v0, v1 }[0], [x0], #16
+ call void @llvm.aarch64.neon.st2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 2
+ ret i64* %tmp
+}
+
+define i64* @test_v2i64_post_reg_st2lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_st2lane:
+;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*)
+
+
+define i64* @test_v1i64_post_imm_st2lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_st2lane:
+;CHECK: st2.d { v0, v1 }[0], [x0], #16
+ call void @llvm.aarch64.neon.st2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 2
+ ret i64* %tmp
+}
+
+define i64* @test_v1i64_post_reg_st2lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_st2lane:
+;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*)
+
+
+define float* @test_v4f32_post_imm_st2lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_st2lane:
+;CHECK: st2.s { v0, v1 }[0], [x0], #8
+ call void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 2
+ ret float* %tmp
+}
+
+define float* @test_v4f32_post_reg_st2lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_st2lane:
+;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*)
+
+
+define float* @test_v2f32_post_imm_st2lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_st2lane:
+;CHECK: st2.s { v0, v1 }[0], [x0], #8
+ call void @llvm.aarch64.neon.st2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 2
+ ret float* %tmp
+}
+
+define float* @test_v2f32_post_reg_st2lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_st2lane:
+;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*)
+
+
+define double* @test_v2f64_post_imm_st2lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_st2lane:
+;CHECK: st2.d { v0, v1 }[0], [x0], #16
+ call void @llvm.aarch64.neon.st2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 2
+ ret double* %tmp
+}
+
+define double* @test_v2f64_post_reg_st2lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_st2lane:
+;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v2f64.p0f64(<2 x double>, <2 x double>, i64, double*)
+
+
+define double* @test_v1f64_post_imm_st2lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_st2lane:
+;CHECK: st2.d { v0, v1 }[0], [x0], #16
+ call void @llvm.aarch64.neon.st2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 2
+ ret double* %tmp
+}
+
+define double* @test_v1f64_post_reg_st2lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_st2lane:
+;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st2lane.v1f64.p0f64(<1 x double>, <1 x double>, i64, double*)
+
+
+define i8* @test_v16i8_post_imm_st3lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_st3lane:
+;CHECK: st3.b { v0, v1, v2 }[0], [x0], #3
+ call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 3
+ ret i8* %tmp
+}
+
+define i8* @test_v16i8_post_reg_st3lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_st3lane:
+;CHECK: st3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*)
+
+
+define i8* @test_v8i8_post_imm_st3lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_st3lane:
+;CHECK: st3.b { v0, v1, v2 }[0], [x0], #3
+ call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 3
+ ret i8* %tmp
+}
+
+define i8* @test_v8i8_post_reg_st3lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_st3lane:
+;CHECK: st3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i64, i8*)
+
+
+define i16* @test_v8i16_post_imm_st3lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_st3lane:
+;CHECK: st3.h { v0, v1, v2 }[0], [x0], #6
+ call void @llvm.aarch64.neon.st3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 3
+ ret i16* %tmp
+}
+
+define i16* @test_v8i16_post_reg_st3lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_st3lane:
+;CHECK: st3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*)
+
+
+define i16* @test_v4i16_post_imm_st3lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_st3lane:
+;CHECK: st3.h { v0, v1, v2 }[0], [x0], #6
+ call void @llvm.aarch64.neon.st3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 3
+ ret i16* %tmp
+}
+
+define i16* @test_v4i16_post_reg_st3lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_st3lane:
+;CHECK: st3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i64, i16*)
+
+
+define i32* @test_v4i32_post_imm_st3lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_st3lane:
+;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12
+ call void @llvm.aarch64.neon.st3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 3
+ ret i32* %tmp
+}
+
+define i32* @test_v4i32_post_reg_st3lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_st3lane:
+;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*)
+
+
+define i32* @test_v2i32_post_imm_st3lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_st3lane:
+;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12
+ call void @llvm.aarch64.neon.st3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 3
+ ret i32* %tmp
+}
+
+define i32* @test_v2i32_post_reg_st3lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_st3lane:
+;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i64, i32*)
+
+
+define i64* @test_v2i64_post_imm_st3lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_st3lane:
+;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24
+ call void @llvm.aarch64.neon.st3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 3
+ ret i64* %tmp
+}
+
+define i64* @test_v2i64_post_reg_st3lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_st3lane:
+;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*)
+
+
+define i64* @test_v1i64_post_imm_st3lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_st3lane:
+;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24
+ call void @llvm.aarch64.neon.st3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 3
+ ret i64* %tmp
+}
+
+define i64* @test_v1i64_post_reg_st3lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_st3lane:
+;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*)
+
+
+define float* @test_v4f32_post_imm_st3lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_st3lane:
+;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12
+ call void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 3
+ ret float* %tmp
+}
+
+define float* @test_v4f32_post_reg_st3lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_st3lane:
+;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*)
+
+
+define float* @test_v2f32_post_imm_st3lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_st3lane:
+;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12
+ call void @llvm.aarch64.neon.st3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 3
+ ret float* %tmp
+}
+
+define float* @test_v2f32_post_reg_st3lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_st3lane:
+;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*)
+
+
+define double* @test_v2f64_post_imm_st3lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_st3lane:
+;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24
+ call void @llvm.aarch64.neon.st3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 3
+ ret double* %tmp
+}
+
+define double* @test_v2f64_post_reg_st3lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_st3lane:
+;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, i64, double*)
+
+
+define double* @test_v1f64_post_imm_st3lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_st3lane:
+;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24
+ call void @llvm.aarch64.neon.st3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 3
+ ret double* %tmp
+}
+
+define double* @test_v1f64_post_reg_st3lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_st3lane:
+;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st3lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, i64, double*)
+
+
+define i8* @test_v16i8_post_imm_st4lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind {
+;CHECK-LABEL: test_v16i8_post_imm_st4lane:
+;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], #4
+ call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 4
+ ret i8* %tmp
+}
+
+define i8* @test_v16i8_post_reg_st4lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v16i8_post_reg_st4lane:
+;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*)
+
+
+define i8* @test_v8i8_post_imm_st4lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind {
+;CHECK-LABEL: test_v8i8_post_imm_st4lane:
+;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], #4
+ call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i32 4
+ ret i8* %tmp
+}
+
+define i8* @test_v8i8_post_reg_st4lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i8_post_reg_st4lane:
+;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A)
+ %tmp = getelementptr i8* %A, i64 %inc
+ ret i8* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i64, i8*)
+
+
+define i16* @test_v8i16_post_imm_st4lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind {
+;CHECK-LABEL: test_v8i16_post_imm_st4lane:
+;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], #8
+ call void @llvm.aarch64.neon.st4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 4
+ ret i16* %tmp
+}
+
+define i16* @test_v8i16_post_reg_st4lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v8i16_post_reg_st4lane:
+;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*)
+
+
+define i16* @test_v4i16_post_imm_st4lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind {
+;CHECK-LABEL: test_v4i16_post_imm_st4lane:
+;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], #8
+ call void @llvm.aarch64.neon.st4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i32 4
+ ret i16* %tmp
+}
+
+define i16* @test_v4i16_post_reg_st4lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i16_post_reg_st4lane:
+;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A)
+ %tmp = getelementptr i16* %A, i64 %inc
+ ret i16* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i64, i16*)
+
+
+define i32* @test_v4i32_post_imm_st4lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind {
+;CHECK-LABEL: test_v4i32_post_imm_st4lane:
+;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16
+ call void @llvm.aarch64.neon.st4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 4
+ ret i32* %tmp
+}
+
+define i32* @test_v4i32_post_reg_st4lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4i32_post_reg_st4lane:
+;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*)
+
+
+define i32* @test_v2i32_post_imm_st4lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind {
+;CHECK-LABEL: test_v2i32_post_imm_st4lane:
+;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16
+ call void @llvm.aarch64.neon.st4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i32 4
+ ret i32* %tmp
+}
+
+define i32* @test_v2i32_post_reg_st4lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i32_post_reg_st4lane:
+;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A)
+ %tmp = getelementptr i32* %A, i64 %inc
+ ret i32* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i64, i32*)
+
+
+define i64* @test_v2i64_post_imm_st4lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind {
+;CHECK-LABEL: test_v2i64_post_imm_st4lane:
+;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32
+ call void @llvm.aarch64.neon.st4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 4
+ ret i64* %tmp
+}
+
+define i64* @test_v2i64_post_reg_st4lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2i64_post_reg_st4lane:
+;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*)
+
+
+define i64* @test_v1i64_post_imm_st4lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind {
+;CHECK-LABEL: test_v1i64_post_imm_st4lane:
+;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32
+ call void @llvm.aarch64.neon.st4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 4
+ ret i64* %tmp
+}
+
+define i64* @test_v1i64_post_reg_st4lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1i64_post_reg_st4lane:
+;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A)
+ %tmp = getelementptr i64* %A, i64 %inc
+ ret i64* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*)
+
+
+define float* @test_v4f32_post_imm_st4lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind {
+;CHECK-LABEL: test_v4f32_post_imm_st4lane:
+;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16
+ call void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 4
+ ret float* %tmp
+}
+
+define float* @test_v4f32_post_reg_st4lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v4f32_post_reg_st4lane:
+;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*)
+
+
+define float* @test_v2f32_post_imm_st4lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind {
+;CHECK-LABEL: test_v2f32_post_imm_st4lane:
+;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16
+ call void @llvm.aarch64.neon.st4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i32 4
+ ret float* %tmp
+}
+
+define float* @test_v2f32_post_reg_st4lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f32_post_reg_st4lane:
+;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A)
+ %tmp = getelementptr float* %A, i64 %inc
+ ret float* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, i64, float*)
+
+
+define double* @test_v2f64_post_imm_st4lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind {
+;CHECK-LABEL: test_v2f64_post_imm_st4lane:
+;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32
+ call void @llvm.aarch64.neon.st4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 4
+ ret double* %tmp
+}
+
+define double* @test_v2f64_post_reg_st4lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v2f64_post_reg_st4lane:
+;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, i64, double*)
+
+
+define double* @test_v1f64_post_imm_st4lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind {
+;CHECK-LABEL: test_v1f64_post_imm_st4lane:
+;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32
+ call void @llvm.aarch64.neon.st4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 4
+ ret double* %tmp
+}
+
+define double* @test_v1f64_post_reg_st4lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 %inc) nounwind {
+;CHECK-LABEL: test_v1f64_post_reg_st4lane:
+;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
+ call void @llvm.aarch64.neon.st4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A)
+ %tmp = getelementptr double* %A, i64 %inc
+ ret double* %tmp
+}
+
+declare void @llvm.aarch64.neon.st4lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, i64, double*)
+
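+; Post-increment ld1r tests: load a single element and replicate it into
+; every lane of the result, written as a scalar load followed by a chain of
+; insertelements so the backend can fold it into ld1r with writeback.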
+define <16 x i8> @test_v16i8_post_imm_ld1r(i8* %bar, i8** %ptr) {
+; CHECK-LABEL: test_v16i8_post_imm_ld1r:
+; CHECK: ld1r.16b { v0 }, [x0], #1
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
+ %tmp3 = insertelement <16 x i8> %tmp2, i8 %tmp1, i32 1
+ %tmp4 = insertelement <16 x i8> %tmp3, i8 %tmp1, i32 2
+ %tmp5 = insertelement <16 x i8> %tmp4, i8 %tmp1, i32 3
+ %tmp6 = insertelement <16 x i8> %tmp5, i8 %tmp1, i32 4
+ %tmp7 = insertelement <16 x i8> %tmp6, i8 %tmp1, i32 5
+ %tmp8 = insertelement <16 x i8> %tmp7, i8 %tmp1, i32 6
+ %tmp9 = insertelement <16 x i8> %tmp8, i8 %tmp1, i32 7
+ %tmp10 = insertelement <16 x i8> %tmp9, i8 %tmp1, i32 8
+ %tmp11 = insertelement <16 x i8> %tmp10, i8 %tmp1, i32 9
+ %tmp12 = insertelement <16 x i8> %tmp11, i8 %tmp1, i32 10
+ %tmp13 = insertelement <16 x i8> %tmp12, i8 %tmp1, i32 11
+ %tmp14 = insertelement <16 x i8> %tmp13, i8 %tmp1, i32 12
+ %tmp15 = insertelement <16 x i8> %tmp14, i8 %tmp1, i32 13
+ %tmp16 = insertelement <16 x i8> %tmp15, i8 %tmp1, i32 14
+ %tmp17 = insertelement <16 x i8> %tmp16, i8 %tmp1, i32 15
+ %tmp18 = getelementptr i8* %bar, i64 1
+ store i8* %tmp18, i8** %ptr
+ ret <16 x i8> %tmp17
+}
+
+define <16 x i8> @test_v16i8_post_reg_ld1r(i8* %bar, i8** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v16i8_post_reg_ld1r:
+; CHECK: ld1r.16b { v0 }, [x0], x{{[0-9]+}}
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
+ %tmp3 = insertelement <16 x i8> %tmp2, i8 %tmp1, i32 1
+ %tmp4 = insertelement <16 x i8> %tmp3, i8 %tmp1, i32 2
+ %tmp5 = insertelement <16 x i8> %tmp4, i8 %tmp1, i32 3
+ %tmp6 = insertelement <16 x i8> %tmp5, i8 %tmp1, i32 4
+ %tmp7 = insertelement <16 x i8> %tmp6, i8 %tmp1, i32 5
+ %tmp8 = insertelement <16 x i8> %tmp7, i8 %tmp1, i32 6
+ %tmp9 = insertelement <16 x i8> %tmp8, i8 %tmp1, i32 7
+ %tmp10 = insertelement <16 x i8> %tmp9, i8 %tmp1, i32 8
+ %tmp11 = insertelement <16 x i8> %tmp10, i8 %tmp1, i32 9
+ %tmp12 = insertelement <16 x i8> %tmp11, i8 %tmp1, i32 10
+ %tmp13 = insertelement <16 x i8> %tmp12, i8 %tmp1, i32 11
+ %tmp14 = insertelement <16 x i8> %tmp13, i8 %tmp1, i32 12
+ %tmp15 = insertelement <16 x i8> %tmp14, i8 %tmp1, i32 13
+ %tmp16 = insertelement <16 x i8> %tmp15, i8 %tmp1, i32 14
+ %tmp17 = insertelement <16 x i8> %tmp16, i8 %tmp1, i32 15
+ %tmp18 = getelementptr i8* %bar, i64 %inc
+ store i8* %tmp18, i8** %ptr
+ ret <16 x i8> %tmp17
+}
+
+define <8 x i8> @test_v8i8_post_imm_ld1r(i8* %bar, i8** %ptr) {
+; CHECK-LABEL: test_v8i8_post_imm_ld1r:
+; CHECK: ld1r.8b { v0 }, [x0], #1
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
+ %tmp3 = insertelement <8 x i8> %tmp2, i8 %tmp1, i32 1
+ %tmp4 = insertelement <8 x i8> %tmp3, i8 %tmp1, i32 2
+ %tmp5 = insertelement <8 x i8> %tmp4, i8 %tmp1, i32 3
+ %tmp6 = insertelement <8 x i8> %tmp5, i8 %tmp1, i32 4
+ %tmp7 = insertelement <8 x i8> %tmp6, i8 %tmp1, i32 5
+ %tmp8 = insertelement <8 x i8> %tmp7, i8 %tmp1, i32 6
+ %tmp9 = insertelement <8 x i8> %tmp8, i8 %tmp1, i32 7
+ %tmp10 = getelementptr i8* %bar, i64 1
+ store i8* %tmp10, i8** %ptr
+ ret <8 x i8> %tmp9
+}
+
+define <8 x i8> @test_v8i8_post_reg_ld1r(i8* %bar, i8** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v8i8_post_reg_ld1r:
+; CHECK: ld1r.8b { v0 }, [x0], x{{[0-9]+}}
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
+ %tmp3 = insertelement <8 x i8> %tmp2, i8 %tmp1, i32 1
+ %tmp4 = insertelement <8 x i8> %tmp3, i8 %tmp1, i32 2
+ %tmp5 = insertelement <8 x i8> %tmp4, i8 %tmp1, i32 3
+ %tmp6 = insertelement <8 x i8> %tmp5, i8 %tmp1, i32 4
+ %tmp7 = insertelement <8 x i8> %tmp6, i8 %tmp1, i32 5
+ %tmp8 = insertelement <8 x i8> %tmp7, i8 %tmp1, i32 6
+ %tmp9 = insertelement <8 x i8> %tmp8, i8 %tmp1, i32 7
+ %tmp10 = getelementptr i8* %bar, i64 %inc
+ store i8* %tmp10, i8** %ptr
+ ret <8 x i8> %tmp9
+}
+
+define <8 x i16> @test_v8i16_post_imm_ld1r(i16* %bar, i16** %ptr) {
+; CHECK-LABEL: test_v8i16_post_imm_ld1r:
+; CHECK: ld1r.8h { v0 }, [x0], #2
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmp1, i32 0
+ %tmp3 = insertelement <8 x i16> %tmp2, i16 %tmp1, i32 1
+ %tmp4 = insertelement <8 x i16> %tmp3, i16 %tmp1, i32 2
+ %tmp5 = insertelement <8 x i16> %tmp4, i16 %tmp1, i32 3
+ %tmp6 = insertelement <8 x i16> %tmp5, i16 %tmp1, i32 4
+ %tmp7 = insertelement <8 x i16> %tmp6, i16 %tmp1, i32 5
+ %tmp8 = insertelement <8 x i16> %tmp7, i16 %tmp1, i32 6
+ %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 7
+ %tmp10 = getelementptr i16* %bar, i64 1
+ store i16* %tmp10, i16** %ptr
+ ret <8 x i16> %tmp9
+}
+
+define <8 x i16> @test_v8i16_post_reg_ld1r(i16* %bar, i16** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v8i16_post_reg_ld1r:
+; CHECK: ld1r.8h { v0 }, [x0], x{{[0-9]+}}
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmp1, i32 0
+ %tmp3 = insertelement <8 x i16> %tmp2, i16 %tmp1, i32 1
+ %tmp4 = insertelement <8 x i16> %tmp3, i16 %tmp1, i32 2
+ %tmp5 = insertelement <8 x i16> %tmp4, i16 %tmp1, i32 3
+ %tmp6 = insertelement <8 x i16> %tmp5, i16 %tmp1, i32 4
+ %tmp7 = insertelement <8 x i16> %tmp6, i16 %tmp1, i32 5
+ %tmp8 = insertelement <8 x i16> %tmp7, i16 %tmp1, i32 6
+ %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 7
+ %tmp10 = getelementptr i16* %bar, i64 %inc
+ store i16* %tmp10, i16** %ptr
+ ret <8 x i16> %tmp9
+}
+
+define <4 x i16> @test_v4i16_post_imm_ld1r(i16* %bar, i16** %ptr) {
+; CHECK-LABEL: test_v4i16_post_imm_ld1r:
+; CHECK: ld1r.4h { v0 }, [x0], #2
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <4 x i16> <i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmp1, i32 0
+ %tmp3 = insertelement <4 x i16> %tmp2, i16 %tmp1, i32 1
+ %tmp4 = insertelement <4 x i16> %tmp3, i16 %tmp1, i32 2
+ %tmp5 = insertelement <4 x i16> %tmp4, i16 %tmp1, i32 3
+ %tmp6 = getelementptr i16* %bar, i64 1
+ store i16* %tmp6, i16** %ptr
+ ret <4 x i16> %tmp5
+}
+
+define <4 x i16> @test_v4i16_post_reg_ld1r(i16* %bar, i16** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v4i16_post_reg_ld1r:
+; CHECK: ld1r.4h { v0 }, [x0], x{{[0-9]+}}
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <4 x i16> <i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmp1, i32 0
+ %tmp3 = insertelement <4 x i16> %tmp2, i16 %tmp1, i32 1
+ %tmp4 = insertelement <4 x i16> %tmp3, i16 %tmp1, i32 2
+ %tmp5 = insertelement <4 x i16> %tmp4, i16 %tmp1, i32 3
+ %tmp6 = getelementptr i16* %bar, i64 %inc
+ store i16* %tmp6, i16** %ptr
+ ret <4 x i16> %tmp5
+}
+
+define <4 x i32> @test_v4i32_post_imm_ld1r(i32* %bar, i32** %ptr) {
+; CHECK-LABEL: test_v4i32_post_imm_ld1r:
+; CHECK: ld1r.4s { v0 }, [x0], #4
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <4 x i32> <i32 undef, i32 undef, i32 undef, i32 undef>, i32 %tmp1, i32 0
+ %tmp3 = insertelement <4 x i32> %tmp2, i32 %tmp1, i32 1
+ %tmp4 = insertelement <4 x i32> %tmp3, i32 %tmp1, i32 2
+ %tmp5 = insertelement <4 x i32> %tmp4, i32 %tmp1, i32 3
+ %tmp6 = getelementptr i32* %bar, i64 1
+ store i32* %tmp6, i32** %ptr
+ ret <4 x i32> %tmp5
+}
+
+define <4 x i32> @test_v4i32_post_reg_ld1r(i32* %bar, i32** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v4i32_post_reg_ld1r:
+; CHECK: ld1r.4s { v0 }, [x0], x{{[0-9]+}}
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <4 x i32> <i32 undef, i32 undef, i32 undef, i32 undef>, i32 %tmp1, i32 0
+ %tmp3 = insertelement <4 x i32> %tmp2, i32 %tmp1, i32 1
+ %tmp4 = insertelement <4 x i32> %tmp3, i32 %tmp1, i32 2
+ %tmp5 = insertelement <4 x i32> %tmp4, i32 %tmp1, i32 3
+ %tmp6 = getelementptr i32* %bar, i64 %inc
+ store i32* %tmp6, i32** %ptr
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i32> @test_v2i32_post_imm_ld1r(i32* %bar, i32** %ptr) {
+; CHECK-LABEL: test_v2i32_post_imm_ld1r:
+; CHECK: ld1r.2s { v0 }, [x0], #4
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <2 x i32> <i32 undef, i32 undef>, i32 %tmp1, i32 0
+ %tmp3 = insertelement <2 x i32> %tmp2, i32 %tmp1, i32 1
+ %tmp4 = getelementptr i32* %bar, i64 1
+ store i32* %tmp4, i32** %ptr
+ ret <2 x i32> %tmp3
+}
+
+define <2 x i32> @test_v2i32_post_reg_ld1r(i32* %bar, i32** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v2i32_post_reg_ld1r:
+; CHECK: ld1r.2s { v0 }, [x0], x{{[0-9]+}}
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <2 x i32> <i32 undef, i32 undef>, i32 %tmp1, i32 0
+ %tmp3 = insertelement <2 x i32> %tmp2, i32 %tmp1, i32 1
+ %tmp4 = getelementptr i32* %bar, i64 %inc
+ store i32* %tmp4, i32** %ptr
+ ret <2 x i32> %tmp3
+}
+
+define <2 x i64> @test_v2i64_post_imm_ld1r(i64* %bar, i64** %ptr) {
+; CHECK-LABEL: test_v2i64_post_imm_ld1r:
+; CHECK: ld1r.2d { v0 }, [x0], #8
+ %tmp1 = load i64* %bar
+ %tmp2 = insertelement <2 x i64> <i64 undef, i64 undef>, i64 %tmp1, i32 0
+ %tmp3 = insertelement <2 x i64> %tmp2, i64 %tmp1, i32 1
+ %tmp4 = getelementptr i64* %bar, i64 1
+ store i64* %tmp4, i64** %ptr
+ ret <2 x i64> %tmp3
+}
+
+define <2 x i64> @test_v2i64_post_reg_ld1r(i64* %bar, i64** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v2i64_post_reg_ld1r:
+; CHECK: ld1r.2d { v0 }, [x0], x{{[0-9]+}}
+ %tmp1 = load i64* %bar
+ %tmp2 = insertelement <2 x i64> <i64 undef, i64 undef>, i64 %tmp1, i32 0
+ %tmp3 = insertelement <2 x i64> %tmp2, i64 %tmp1, i32 1
+ %tmp4 = getelementptr i64* %bar, i64 %inc
+ store i64* %tmp4, i64** %ptr
+ ret <2 x i64> %tmp3
+}
+
+define <4 x float> @test_v4f32_post_imm_ld1r(float* %bar, float** %ptr) {
+; CHECK-LABEL: test_v4f32_post_imm_ld1r:
+; CHECK: ld1r.4s { v0 }, [x0], #4
+ %tmp1 = load float* %bar
+ %tmp2 = insertelement <4 x float> <float undef, float undef, float undef, float undef>, float %tmp1, i32 0
+ %tmp3 = insertelement <4 x float> %tmp2, float %tmp1, i32 1
+ %tmp4 = insertelement <4 x float> %tmp3, float %tmp1, i32 2
+ %tmp5 = insertelement <4 x float> %tmp4, float %tmp1, i32 3
+ %tmp6 = getelementptr float* %bar, i64 1
+ store float* %tmp6, float** %ptr
+ ret <4 x float> %tmp5
+}
+
+define <4 x float> @test_v4f32_post_reg_ld1r(float* %bar, float** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v4f32_post_reg_ld1r:
+; CHECK: ld1r.4s { v0 }, [x0], x{{[0-9]+}}
+ %tmp1 = load float* %bar
+ %tmp2 = insertelement <4 x float> <float undef, float undef, float undef, float undef>, float %tmp1, i32 0
+ %tmp3 = insertelement <4 x float> %tmp2, float %tmp1, i32 1
+ %tmp4 = insertelement <4 x float> %tmp3, float %tmp1, i32 2
+ %tmp5 = insertelement <4 x float> %tmp4, float %tmp1, i32 3
+ %tmp6 = getelementptr float* %bar, i64 %inc
+ store float* %tmp6, float** %ptr
+ ret <4 x float> %tmp5
+}
+
+define <2 x float> @test_v2f32_post_imm_ld1r(float* %bar, float** %ptr) {
+; CHECK-LABEL: test_v2f32_post_imm_ld1r:
+; CHECK: ld1r.2s { v0 }, [x0], #4
+ %tmp1 = load float* %bar
+ %tmp2 = insertelement <2 x float> <float undef, float undef>, float %tmp1, i32 0
+ %tmp3 = insertelement <2 x float> %tmp2, float %tmp1, i32 1
+ %tmp4 = getelementptr float* %bar, i64 1
+ store float* %tmp4, float** %ptr
+ ret <2 x float> %tmp3
+}
+
+define <2 x float> @test_v2f32_post_reg_ld1r(float* %bar, float** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v2f32_post_reg_ld1r:
+; CHECK: ld1r.2s { v0 }, [x0], x{{[0-9]+}}
+ %tmp1 = load float* %bar
+ %tmp2 = insertelement <2 x float> <float undef, float undef>, float %tmp1, i32 0
+ %tmp3 = insertelement <2 x float> %tmp2, float %tmp1, i32 1
+ %tmp4 = getelementptr float* %bar, i64 %inc
+ store float* %tmp4, float** %ptr
+ ret <2 x float> %tmp3
+}
+
+define <2 x double> @test_v2f64_post_imm_ld1r(double* %bar, double** %ptr) {
+; CHECK-LABEL: test_v2f64_post_imm_ld1r:
+; CHECK: ld1r.2d { v0 }, [x0], #8
+ %tmp1 = load double* %bar
+ %tmp2 = insertelement <2 x double> <double undef, double undef>, double %tmp1, i32 0
+ %tmp3 = insertelement <2 x double> %tmp2, double %tmp1, i32 1
+ %tmp4 = getelementptr double* %bar, i64 1
+ store double* %tmp4, double** %ptr
+ ret <2 x double> %tmp3
+}
+
+define <2 x double> @test_v2f64_post_reg_ld1r(double* %bar, double** %ptr, i64 %inc) {
+; CHECK-LABEL: test_v2f64_post_reg_ld1r:
+; CHECK: ld1r.2d { v0 }, [x0], x{{[0-9]+}}
+ %tmp1 = load double* %bar
+ %tmp2 = insertelement <2 x double> <double undef, double undef>, double %tmp1, i32 0
+ %tmp3 = insertelement <2 x double> %tmp2, double %tmp1, i32 1
+ %tmp4 = getelementptr double* %bar, i64 %inc
+ store double* %tmp4, double** %ptr
+ ret <2 x double> %tmp3
+}
+
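+; Post-increment ld1 lane tests: load a single element into lane 1 of an
+; existing vector, written as a scalar load plus one insertelement so the
+; backend can fold it into a lane-indexed ld1 with writeback.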
+define <16 x i8> @test_v16i8_post_imm_ld1lane(i8* %bar, i8** %ptr, <16 x i8> %A) {
+; CHECK-LABEL: test_v16i8_post_imm_ld1lane:
+; CHECK: ld1.b { v0 }[1], [x0], #1
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <16 x i8> %A, i8 %tmp1, i32 1
+ %tmp3 = getelementptr i8* %bar, i64 1
+ store i8* %tmp3, i8** %ptr
+ ret <16 x i8> %tmp2
+}
+
+define <16 x i8> @test_v16i8_post_reg_ld1lane(i8* %bar, i8** %ptr, i64 %inc, <16 x i8> %A) {
+; CHECK-LABEL: test_v16i8_post_reg_ld1lane:
+; CHECK: ld1.b { v0 }[1], [x0], x{{[0-9]+}}
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <16 x i8> %A, i8 %tmp1, i32 1
+ %tmp3 = getelementptr i8* %bar, i64 %inc
+ store i8* %tmp3, i8** %ptr
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i8> @test_v8i8_post_imm_ld1lane(i8* %bar, i8** %ptr, <8 x i8> %A) {
+; CHECK-LABEL: test_v8i8_post_imm_ld1lane:
+; CHECK: ld1.b { v0 }[1], [x0], #1
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <8 x i8> %A, i8 %tmp1, i32 1
+ %tmp3 = getelementptr i8* %bar, i64 1
+ store i8* %tmp3, i8** %ptr
+ ret <8 x i8> %tmp2
+}
+
+define <8 x i8> @test_v8i8_post_reg_ld1lane(i8* %bar, i8** %ptr, i64 %inc, <8 x i8> %A) {
+; CHECK-LABEL: test_v8i8_post_reg_ld1lane:
+; CHECK: ld1.b { v0 }[1], [x0], x{{[0-9]+}}
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <8 x i8> %A, i8 %tmp1, i32 1
+ %tmp3 = getelementptr i8* %bar, i64 %inc
+ store i8* %tmp3, i8** %ptr
+ ret <8 x i8> %tmp2
+}
+
+define <8 x i16> @test_v8i16_post_imm_ld1lane(i16* %bar, i16** %ptr, <8 x i16> %A) {
+; CHECK-LABEL: test_v8i16_post_imm_ld1lane:
+; CHECK: ld1.h { v0 }[1], [x0], #2
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <8 x i16> %A, i16 %tmp1, i32 1
+ %tmp3 = getelementptr i16* %bar, i64 1
+ store i16* %tmp3, i16** %ptr
+ ret <8 x i16> %tmp2
+}
+
+define <8 x i16> @test_v8i16_post_reg_ld1lane(i16* %bar, i16** %ptr, i64 %inc, <8 x i16> %A) {
+; CHECK-LABEL: test_v8i16_post_reg_ld1lane:
+; CHECK: ld1.h { v0 }[1], [x0], x{{[0-9]+}}
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <8 x i16> %A, i16 %tmp1, i32 1
+ %tmp3 = getelementptr i16* %bar, i64 %inc
+ store i16* %tmp3, i16** %ptr
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i16> @test_v4i16_post_imm_ld1lane(i16* %bar, i16** %ptr, <4 x i16> %A) {
+; CHECK-LABEL: test_v4i16_post_imm_ld1lane:
+; CHECK: ld1.h { v0 }[1], [x0], #2
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <4 x i16> %A, i16 %tmp1, i32 1
+ %tmp3 = getelementptr i16* %bar, i64 1
+ store i16* %tmp3, i16** %ptr
+ ret <4 x i16> %tmp2
+}
+
+define <4 x i16> @test_v4i16_post_reg_ld1lane(i16* %bar, i16** %ptr, i64 %inc, <4 x i16> %A) {
+; CHECK-LABEL: test_v4i16_post_reg_ld1lane:
+; CHECK: ld1.h { v0 }[1], [x0], x{{[0-9]+}}
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <4 x i16> %A, i16 %tmp1, i32 1
+ %tmp3 = getelementptr i16* %bar, i64 %inc
+ store i16* %tmp3, i16** %ptr
+ ret <4 x i16> %tmp2
+}
+
+define <4 x i32> @test_v4i32_post_imm_ld1lane(i32* %bar, i32** %ptr, <4 x i32> %A) {
+; CHECK-LABEL: test_v4i32_post_imm_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], #4
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <4 x i32> %A, i32 %tmp1, i32 1
+ %tmp3 = getelementptr i32* %bar, i64 1
+ store i32* %tmp3, i32** %ptr
+ ret <4 x i32> %tmp2
+}
+
+define <4 x i32> @test_v4i32_post_reg_ld1lane(i32* %bar, i32** %ptr, i64 %inc, <4 x i32> %A) {
+; CHECK-LABEL: test_v4i32_post_reg_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], x{{[0-9]+}}
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <4 x i32> %A, i32 %tmp1, i32 1
+ %tmp3 = getelementptr i32* %bar, i64 %inc
+ store i32* %tmp3, i32** %ptr
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i32> @test_v2i32_post_imm_ld1lane(i32* %bar, i32** %ptr, <2 x i32> %A) {
+; CHECK-LABEL: test_v2i32_post_imm_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], #4
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <2 x i32> %A, i32 %tmp1, i32 1
+ %tmp3 = getelementptr i32* %bar, i64 1
+ store i32* %tmp3, i32** %ptr
+ ret <2 x i32> %tmp2
+}
+
+define <2 x i32> @test_v2i32_post_reg_ld1lane(i32* %bar, i32** %ptr, i64 %inc, <2 x i32> %A) {
+; CHECK-LABEL: test_v2i32_post_reg_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], x{{[0-9]+}}
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <2 x i32> %A, i32 %tmp1, i32 1
+ %tmp3 = getelementptr i32* %bar, i64 %inc
+ store i32* %tmp3, i32** %ptr
+ ret <2 x i32> %tmp2
+}
+
+define <2 x i64> @test_v2i64_post_imm_ld1lane(i64* %bar, i64** %ptr, <2 x i64> %A) {
+; CHECK-LABEL: test_v2i64_post_imm_ld1lane:
+; CHECK: ld1.d { v0 }[1], [x0], #8
+ %tmp1 = load i64* %bar
+ %tmp2 = insertelement <2 x i64> %A, i64 %tmp1, i32 1
+ %tmp3 = getelementptr i64* %bar, i64 1
+ store i64* %tmp3, i64** %ptr
+ ret <2 x i64> %tmp2
+}
+
+define <2 x i64> @test_v2i64_post_reg_ld1lane(i64* %bar, i64** %ptr, i64 %inc, <2 x i64> %A) {
+; CHECK-LABEL: test_v2i64_post_reg_ld1lane:
+; CHECK: ld1.d { v0 }[1], [x0], x{{[0-9]+}}
+ %tmp1 = load i64* %bar
+ %tmp2 = insertelement <2 x i64> %A, i64 %tmp1, i32 1
+ %tmp3 = getelementptr i64* %bar, i64 %inc
+ store i64* %tmp3, i64** %ptr
+ ret <2 x i64> %tmp2
+}
+
+define <4 x float> @test_v4f32_post_imm_ld1lane(float* %bar, float** %ptr, <4 x float> %A) {
+; CHECK-LABEL: test_v4f32_post_imm_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], #4
+ %tmp1 = load float* %bar
+ %tmp2 = insertelement <4 x float> %A, float %tmp1, i32 1
+ %tmp3 = getelementptr float* %bar, i64 1
+ store float* %tmp3, float** %ptr
+ ret <4 x float> %tmp2
+}
+
+define <4 x float> @test_v4f32_post_reg_ld1lane(float* %bar, float** %ptr, i64 %inc, <4 x float> %A) {
+; CHECK-LABEL: test_v4f32_post_reg_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], x{{[0-9]+}}
+ %tmp1 = load float* %bar
+ %tmp2 = insertelement <4 x float> %A, float %tmp1, i32 1
+ %tmp3 = getelementptr float* %bar, i64 %inc
+ store float* %tmp3, float** %ptr
+ ret <4 x float> %tmp2
+}
+
+define <2 x float> @test_v2f32_post_imm_ld1lane(float* %bar, float** %ptr, <2 x float> %A) {
+; CHECK-LABEL: test_v2f32_post_imm_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], #4
+ %tmp1 = load float* %bar
+ %tmp2 = insertelement <2 x float> %A, float %tmp1, i32 1
+ %tmp3 = getelementptr float* %bar, i64 1
+ store float* %tmp3, float** %ptr
+ ret <2 x float> %tmp2
+}
+
+define <2 x float> @test_v2f32_post_reg_ld1lane(float* %bar, float** %ptr, i64 %inc, <2 x float> %A) {
+; CHECK-LABEL: test_v2f32_post_reg_ld1lane:
+; CHECK: ld1.s { v0 }[1], [x0], x{{[0-9]+}}
+ %tmp1 = load float* %bar
+ %tmp2 = insertelement <2 x float> %A, float %tmp1, i32 1
+ %tmp3 = getelementptr float* %bar, i64 %inc
+ store float* %tmp3, float** %ptr
+ ret <2 x float> %tmp2
+}
+
+define <2 x double> @test_v2f64_post_imm_ld1lane(double* %bar, double** %ptr, <2 x double> %A) {
+; CHECK-LABEL: test_v2f64_post_imm_ld1lane:
+; CHECK: ld1.d { v0 }[1], [x0], #8
+ %tmp1 = load double* %bar
+ %tmp2 = insertelement <2 x double> %A, double %tmp1, i32 1
+ %tmp3 = getelementptr double* %bar, i64 1
+ store double* %tmp3, double** %ptr
+ ret <2 x double> %tmp2
+}
+
+define <2 x double> @test_v2f64_post_reg_ld1lane(double* %bar, double** %ptr, i64 %inc, <2 x double> %A) {
+; CHECK-LABEL: test_v2f64_post_reg_ld1lane:
+; CHECK: ld1.d { v0 }[1], [x0], x{{[0-9]+}}
+ %tmp1 = load double* %bar
+ %tmp2 = insertelement <2 x double> %A, double %tmp1, i32 1
+ %tmp3 = getelementptr double* %bar, i64 %inc
+ store double* %tmp3, double** %ptr
+ ret <2 x double> %tmp2
+}
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/arm64-inline-asm-error-I.ll b/test/CodeGen/AArch64/arm64-inline-asm-error-I.ll
new file mode 100644
index 0000000..a7aaf9e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-inline-asm-error-I.ll
@@ -0,0 +1,11 @@
+; RUN: not llc -march=arm64 < %s 2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+; Check for at least one invalid constant.
+; CHECK-ERRORS: error: invalid operand for inline asm constraint 'I'
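+; (Roughly, 'I' accepts ADD-style immediates: a 12-bit unsigned value,
+; optionally shifted left by 12. 4097 fits neither form, hence the error.)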
+
+define i32 @constraint_I(i32 %i, i32 %j) nounwind ssp {
+entry:
+ %0 = tail call i32 asm sideeffect "add $0, $1, $2", "=r,r,I"(i32 %i, i32 4097) nounwind
+ ret i32 %0
+}
diff --git a/test/CodeGen/AArch64/arm64-inline-asm-error-J.ll b/test/CodeGen/AArch64/arm64-inline-asm-error-J.ll
new file mode 100644
index 0000000..077e1b8
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-inline-asm-error-J.ll
@@ -0,0 +1,11 @@
+; RUN: not llc -march=arm64 < %s 2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+; Check for at least one invalid constant.
+; CHECK-ERRORS: error: invalid operand for inline asm constraint 'J'
+
+define i32 @constraint_J(i32 %i, i32 %j) nounwind ssp {
+entry:
+ %0 = tail call i32 asm sideeffect "sub $0, $1, $2", "=r,r,J"(i32 %i, i32 2) nounwind
+ ret i32 %0
+}
diff --git a/test/CodeGen/AArch64/arm64-inline-asm-error-K.ll b/test/CodeGen/AArch64/arm64-inline-asm-error-K.ll
new file mode 100644
index 0000000..2a7f961
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-inline-asm-error-K.ll
@@ -0,0 +1,11 @@
+; RUN: not llc -march=arm64 < %s 2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+; Check for at least one invalid constant.
+; CHECK-ERRORS: error: invalid operand for inline asm constraint 'K'
+
+define i32 @constraint_K(i32 %i, i32 %j) nounwind {
+entry:
+ %0 = tail call i32 asm sideeffect "eor $0, $1, $2", "=r,r,K"(i32 %i, i32 -1) nounwind
+ ret i32 %0
+}
diff --git a/test/CodeGen/AArch64/arm64-inline-asm-error-L.ll b/test/CodeGen/AArch64/arm64-inline-asm-error-L.ll
new file mode 100644
index 0000000..1701943
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-inline-asm-error-L.ll
@@ -0,0 +1,11 @@
+; RUN: not llc -march=arm64 < %s 2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+; Check for at least one invalid constant.
+; CHECK-ERRORS: error: invalid operand for inline asm constraint 'L'
+
+define i32 @constraint_L(i32 %i, i32 %j) nounwind {
+entry:
+ %0 = tail call i32 asm sideeffect "eor $0, $1, $2", "=r,r,L"(i32 %i, i64 -1) nounwind
+ ret i32 %0
+}
diff --git a/test/CodeGen/AArch64/arm64-inline-asm-error-M.ll b/test/CodeGen/AArch64/arm64-inline-asm-error-M.ll
new file mode 100644
index 0000000..952bf60
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-inline-asm-error-M.ll
@@ -0,0 +1,11 @@
+; RUN: not llc -march=arm64 < %s 2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+; Check for at least one invalid constant.
+; CHECK-ERRORS: error: invalid operand for inline asm constraint 'M'
+
+define i32 @constraint_M(i32 %i, i32 %j) nounwind {
+entry:
+ %0 = tail call i32 asm sideeffect "movk $0, $1", "=r,M"(i32 305418240) nounwind
+ ret i32 %0
+}
diff --git a/test/CodeGen/AArch64/arm64-inline-asm-error-N.ll b/test/CodeGen/AArch64/arm64-inline-asm-error-N.ll
new file mode 100644
index 0000000..b4a199f
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-inline-asm-error-N.ll
@@ -0,0 +1,11 @@
+; RUN: not llc -march=arm64 < %s 2> %t
+; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+
+; Check for at least one invalid constant.
+; CHECK-ERRORS: error: invalid operand for inline asm constraint 'N'
+
+define i32 @constraint_N(i32 %i, i32 %j) nounwind {
+entry:
+ %0 = tail call i32 asm sideeffect "movk $0, $1", "=r,N"(i64 1311761352401879040) nounwind
+ ret i32 %0
+}
diff --git a/test/CodeGen/AArch64/arm64-inline-asm-zero-reg-error.ll b/test/CodeGen/AArch64/arm64-inline-asm-zero-reg-error.ll
new file mode 100644
index 0000000..6bfce8f
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-inline-asm-zero-reg-error.ll
@@ -0,0 +1,11 @@
+; RUN: not llc < %s -march=arm64 2>&1 | FileCheck %s
+
+
+; The 'z' constraint allocates either xzr or wzr, but obviously an input of 1 is
+; incompatible.
+define void @test_bad_zero_reg() {
+ tail call void asm sideeffect "USE($0)", "z"(i32 1) nounwind
+; CHECK: error: invalid operand for inline asm constraint 'z'
+
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-inline-asm.ll b/test/CodeGen/AArch64/arm64-inline-asm.ll
new file mode 100644
index 0000000..d76cca3
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-inline-asm.ll
@@ -0,0 +1,230 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -no-integrated-as | FileCheck %s
+
+; rdar://9167275
+
+define i32 @t1() nounwind ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: mov {{w[0-9]+}}, 7
+ %0 = tail call i32 asm "mov ${0:w}, 7", "=r"() nounwind
+ ret i32 %0
+}
+
+define i64 @t2() nounwind ssp {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: mov {{x[0-9]+}}, 7
+ %0 = tail call i64 asm "mov $0, 7", "=r"() nounwind
+ ret i64 %0
+}
+
+define i64 @t3() nounwind ssp {
+entry:
+; CHECK-LABEL: t3:
+; CHECK: mov {{w[0-9]+}}, 7
+ %0 = tail call i64 asm "mov ${0:w}, 7", "=r"() nounwind
+ ret i64 %0
+}
+
+; rdar://9281206
+
+define void @t4(i64 %op) nounwind {
+entry:
+; CHECK-LABEL: t4:
+; CHECK: mov x0, {{x[0-9]+}}; svc #0
+ %0 = tail call i64 asm sideeffect "mov x0, $1; svc #0;", "=r,r,r,~{x0}"(i64 %op, i64 undef) nounwind
+ ret void
+}
+
+; rdar://9394290
+
+define float @t5(float %x) nounwind {
+entry:
+; CHECK-LABEL: t5:
+; CHECK: fadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+ %0 = tail call float asm "fadd ${0:s}, ${0:s}, ${0:s}", "=w,0"(float %x) nounwind
+ ret float %0
+}
+
+; rdar://9553599
+
+define zeroext i8 @t6(i8* %src) nounwind {
+entry:
+; CHECK-LABEL: t6:
+; CHECK: ldtrb {{w[0-9]+}}, [{{x[0-9]+}}]
+ %0 = tail call i8 asm "ldtrb ${0:w}, [$1]", "=r,r"(i8* %src) nounwind
+ ret i8 %0
+}
+
+define void @t7(i8* %f, i32 %g) nounwind {
+entry:
+ %f.addr = alloca i8*, align 8
+ store i8* %f, i8** %f.addr, align 8
+ ; CHECK-LABEL: t7:
+ ; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}]
+ call void asm "str ${1:w}, $0", "=*Q,r"(i8** %f.addr, i32 %g) nounwind
+ ret void
+}
+
+; rdar://10258229
+; ARM64TargetLowering::getRegForInlineAsmConstraint() should recognize 'v'
+; registers.
+define void @t8() nounwind ssp {
+entry:
+; CHECK-LABEL: t8:
+; CHECK: stp {{d[0-9]+}}, {{d[0-9]+}}, [sp, #-16]
+ tail call void asm sideeffect "nop", "~{v8}"() nounwind
+ ret void
+}
+
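+; Roughly speaking: 'I'/'J' accept the positive/negative ADD-style immediates
+; (a 12-bit value, optionally shifted left by 12), 'K'/'L' accept 32-bit/64-bit
+; logical (bitmask) immediates, and 'M'/'N' accept 32-bit/64-bit constants that
+; a MOV alias can materialize. The values below are chosen to be accepted.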
+define i32 @constraint_I(i32 %i, i32 %j) nounwind {
+entry:
+ ; CHECK-LABEL: constraint_I:
+ %0 = tail call i32 asm sideeffect "add ${0:w}, ${1:w}, $2", "=r,r,I"(i32 %i, i32 16773120) nounwind
+ ; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #16773120
+ %1 = tail call i32 asm sideeffect "add ${0:w}, ${1:w}, $2", "=r,r,I"(i32 %i, i32 4096) nounwind
+ ; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #4096
+ ret i32 %1
+}
+
+define i32 @constraint_J(i32 %i, i32 %j) nounwind {
+entry:
+ ; CHECK-LABEL: constraint_J:
+ %0 = tail call i32 asm sideeffect "sub ${0:w}, ${1:w}, $2", "=r,r,J"(i32 %i, i32 -16773120) nounwind
+ ; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, #4278194176
+ %1 = tail call i32 asm sideeffect "sub ${0:w}, ${1:w}, $2", "=r,r,J"(i32 %i, i32 -1) nounwind
+ ; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, #4294967295
+ ret i32 %1
+}
+
+define i32 @constraint_KL(i32 %i, i32 %j) nounwind {
+entry:
+ ; CHECK-LABEL: constraint_KL:
+ %0 = tail call i32 asm sideeffect "eor ${0:w}, ${1:w}, $2", "=r,r,K"(i32 %i, i32 255) nounwind
+ ; CHECK: eor {{w[0-9]+}}, {{w[0-9]+}}, #255
+ %1 = tail call i32 asm sideeffect "eor ${0:w}, ${1:w}, $2", "=r,r,L"(i32 %i, i64 16711680) nounwind
+ ; CHECK: eor {{w[0-9]+}}, {{w[0-9]+}}, #16711680
+ ret i32 %1
+}
+
+define i32 @constraint_MN(i32 %i, i32 %j) nounwind {
+entry:
+ ; CHECK-LABEL: constraint_MN:
+ %0 = tail call i32 asm sideeffect "movk ${0:w}, $1", "=r,M"(i32 65535) nounwind
+ ; CHECK: movk {{w[0-9]+}}, #65535
+ %1 = tail call i32 asm sideeffect "movz ${0:w}, $1", "=r,N"(i64 0) nounwind
+ ; CHECK: movz {{w[0-9]+}}, #0
+ ret i32 %1
+}
+
+define void @t9() nounwind {
+entry:
+ ; CHECK-LABEL: t9:
+ %data = alloca <2 x double>, align 16
+ %0 = load <2 x double>* %data, align 16
+ call void asm sideeffect "mov.2d v4, $0\0A", "w,~{v4}"(<2 x double> %0) nounwind
+ ; CHECK: mov.2d v4, {{v[0-9]+}}
+ ret void
+}
+
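+; t10 exercises the ${N:q}/${N:d}/${N:s}/${N:h}/${N:b} operand modifiers, which
+; print the 128-, 64-, 32-, 16- and 8-bit views (q/d/s/h/b register names) of
+; the FP/SIMD register allocated for the operand.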
+define void @t10() nounwind {
+entry:
+ ; CHECK-LABEL: t10:
+ %data = alloca <2 x float>, align 8
+ %a = alloca [2 x float], align 4
+ %arraydecay = getelementptr inbounds [2 x float]* %a, i32 0, i32 0
+ %0 = load <2 x float>* %data, align 8
+ call void asm sideeffect "ldr ${1:q}, [$0]\0A", "r,w"(float* %arraydecay, <2 x float> %0) nounwind
+ ; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}]
+ call void asm sideeffect "ldr ${1:d}, [$0]\0A", "r,w"(float* %arraydecay, <2 x float> %0) nounwind
+ ; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}]
+ call void asm sideeffect "ldr ${1:s}, [$0]\0A", "r,w"(float* %arraydecay, <2 x float> %0) nounwind
+ ; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}]
+ call void asm sideeffect "ldr ${1:h}, [$0]\0A", "r,w"(float* %arraydecay, <2 x float> %0) nounwind
+ ; CHECK: ldr {{h[0-9]+}}, [{{x[0-9]+}}]
+ call void asm sideeffect "ldr ${1:b}, [$0]\0A", "r,w"(float* %arraydecay, <2 x float> %0) nounwind
+ ; CHECK: ldr {{b[0-9]+}}, [{{x[0-9]+}}]
+ ret void
+}
+
+define void @t11() nounwind {
+entry:
+ ; CHECK-LABEL: t11:
+ %a = alloca i32, align 4
+ %0 = load i32* %a, align 4
+ call void asm sideeffect "mov ${1:x}, ${0:x}\0A", "r,i"(i32 %0, i32 0) nounwind
+ ; CHECK: mov xzr, {{x[0-9]+}}
+ %1 = load i32* %a, align 4
+ call void asm sideeffect "mov ${1:w}, ${0:w}\0A", "r,i"(i32 %1, i32 0) nounwind
+ ; CHECK: mov wzr, {{w[0-9]+}}
+ ret void
+}
+
+define void @t12() nounwind {
+entry:
+ ; CHECK-LABEL: t12:
+ %data = alloca <4 x float>, align 16
+ %0 = load <4 x float>* %data, align 16
+ call void asm sideeffect "mov.2d v4, $0\0A", "x,~{v4}"(<4 x float> %0) nounwind
+ ; CHECK: mov.2d v4, {{v([0-9]|1[0-5])}}
+ ret void
+}
+
+define void @t13() nounwind {
+entry:
+ ; CHECK-LABEL: t13:
+ tail call void asm sideeffect "mov x4, $0\0A", "N"(i64 1311673391471656960) nounwind
+ ; CHECK: mov x4, #1311673391471656960
+ tail call void asm sideeffect "mov x4, $0\0A", "N"(i64 -4662) nounwind
+ ; CHECK: mov x4, #-4662
+ tail call void asm sideeffect "mov x4, $0\0A", "N"(i64 4660) nounwind
+ ; CHECK: mov x4, #4660
+ call void asm sideeffect "mov x4, $0\0A", "N"(i64 -71777214294589696) nounwind
+ ; CHECK: mov x4, #-71777214294589696
+ ret void
+}
+
+define void @t14() nounwind {
+entry:
+ ; CHECK-LABEL: t14:
+ tail call void asm sideeffect "mov w4, $0\0A", "M"(i32 305397760) nounwind
+ ; CHECK: mov w4, #305397760
+ tail call void asm sideeffect "mov w4, $0\0A", "M"(i32 -4662) nounwind
+ ; CHECK: mov w4, #4294962634
+ tail call void asm sideeffect "mov w4, $0\0A", "M"(i32 4660) nounwind
+ ; CHECK: mov w4, #4660
+ call void asm sideeffect "mov w4, $0\0A", "M"(i32 -16711936) nounwind
+ ; CHECK: mov w4, #4278255360
+ ret void
+}
+
+define void @t15() nounwind {
+entry:
+ %0 = tail call double asm sideeffect "fmov $0, d8", "=r"() nounwind
+ ; CHECK: fmov {{x[0-9]+}}, d8
+ ret void
+}
+
+; rdar://problem/14285178
+
+define void @test_zero_reg(i32* %addr) {
+; CHECK-LABEL: test_zero_reg:
+
+ tail call void asm sideeffect "USE($0)", "z"(i32 0) nounwind
+; CHECK: USE(xzr)
+
+ tail call void asm sideeffect "USE(${0:w})", "zr"(i32 0)
+; CHECK: USE(wzr)
+
+ tail call void asm sideeffect "USE(${0:w})", "zr"(i32 1)
+; CHECK: orr [[VAL1:w[0-9]+]], wzr, #0x1
+; CHECK: USE([[VAL1]])
+
+ tail call void asm sideeffect "USE($0), USE($1)", "z,z"(i32 0, i32 0) nounwind
+; CHECK: USE(xzr), USE(xzr)
+
+ tail call void asm sideeffect "USE($0), USE(${1:w})", "z,z"(i32 0, i32 0) nounwind
+; CHECK: USE(xzr), USE(wzr)
+
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-join-reserved.ll b/test/CodeGen/AArch64/arm64-join-reserved.ll
new file mode 100644
index 0000000..e99168b
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-join-reserved.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -verify-machineinstrs | FileCheck %s
+target triple = "arm64-apple-macosx10"
+
+; Make sure that a store to [sp] addresses off sp directly.
+; A move isn't necessary.
+; <rdar://problem/11492712>
+; CHECK-LABEL: g:
+; CHECK: str xzr, [sp]
+; CHECK: bl
+; CHECK: ret
+define void @g() nounwind ssp {
+entry:
+ tail call void (i32, ...)* @f(i32 0, i32 0) nounwind
+ ret void
+}
+
+declare void @f(i32, ...)
diff --git a/test/CodeGen/AArch64/arm64-jumptable.ll b/test/CodeGen/AArch64/arm64-jumptable.ll
new file mode 100644
index 0000000..4635cfe
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-jumptable.ll
@@ -0,0 +1,35 @@
+; RUN: llc -mtriple=arm64-apple-ios < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu < %s | FileCheck %s --check-prefix=CHECK-LINUX
+; <rdar://11417675>
+
+define void @sum(i32* %to) {
+entry:
+ switch i32 undef, label %exit [
+ i32 1, label %bb1
+ i32 2, label %bb2
+ i32 3, label %bb3
+ i32 4, label %bb4
+ ]
+bb1:
+ store i32 undef, i32* %to
+ br label %exit
+bb2:
+ store i32 undef, i32* %to
+ br label %exit
+bb3:
+ store i32 undef, i32* %to
+ br label %exit
+bb4:
+ store i32 undef, i32* %to
+ br label %exit
+exit:
+ ret void
+}
+
+; CHECK-LABEL: sum:
+; CHECK: adrp {{x[0-9]+}}, LJTI0_0@PAGE
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, LJTI0_0@PAGEOFF
+
+; CHECK-LINUX-LABEL: sum:
+; CHECK-LINUX: adrp {{x[0-9]+}}, .LJTI0_0
+; CHECK-LINUX: add {{x[0-9]+}}, {{x[0-9]+}}, :lo12:.LJTI0_0
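+
+; On MachO the jump-table address is materialized page-relatively: adrp forms
+; the 4KB page base (@PAGE) and the add supplies the low 12 bits (@PAGEOFF);
+; the ELF equivalents are the bare symbol for adrp and :lo12: for the add.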
diff --git a/test/CodeGen/AArch64/arm64-large-frame.ll b/test/CodeGen/AArch64/arm64-large-frame.ll
new file mode 100644
index 0000000..5a53da6
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-large-frame.ll
@@ -0,0 +1,69 @@
+; RUN: llc -verify-machineinstrs -mtriple=arm64-none-linux-gnu -disable-fp-elim < %s | FileCheck %s
+declare void @use_addr(i8*)
+
+@addr = global i8* null
+
+define void @test_bigframe() {
+; CHECK-LABEL: test_bigframe:
+; CHECK: .cfi_startproc
+
+ %var1 = alloca i8, i32 20000000
+ %var2 = alloca i8, i32 16
+ %var3 = alloca i8, i32 20000000
+
+; CHECK: sub sp, sp, #4095, lsl #12
+; CHECK: sub sp, sp, #4095, lsl #12
+; CHECK: sub sp, sp, #1575, lsl #12
+; CHECK: sub sp, sp, #2576
+; CHECK: .cfi_def_cfa_offset 40000032
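+; (SUB only takes a 12-bit immediate, optionally shifted left by 12, so the
+; 20000000 + 16 + 20000000 = 40000016 byte frame is split as
+; 4095*4096 + 4095*4096 + 1575*4096 + 2576 = 40000016; the remaining 16 bytes
+; of the 40000032 CFA offset correspond to the saved fp/lr pair.)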
+
+
+; CHECK: add [[TMP:x[0-9]+]], sp, #4095, lsl #12
+; CHECK: add [[TMP1:x[0-9]+]], [[TMP]], #787, lsl #12
+; CHECK: add {{x[0-9]+}}, [[TMP1]], #3344
+ store volatile i8* %var1, i8** @addr
+
+ %var1plus2 = getelementptr i8* %var1, i32 2
+ store volatile i8* %var1plus2, i8** @addr
+
+; CHECK: add [[TMP:x[0-9]+]], sp, #4095, lsl #12
+; CHECK: add [[TMP1:x[0-9]+]], [[TMP]], #787, lsl #12
+; CHECK: add {{x[0-9]+}}, [[TMP1]], #3328
+ store volatile i8* %var2, i8** @addr
+
+ %var2plus2 = getelementptr i8* %var2, i32 2
+ store volatile i8* %var2plus2, i8** @addr
+
+ store volatile i8* %var3, i8** @addr
+
+ %var3plus2 = getelementptr i8* %var3, i32 2
+ store volatile i8* %var3plus2, i8** @addr
+
+; CHECK: add sp, sp, #4095, lsl #12
+; CHECK: add sp, sp, #4095, lsl #12
+; CHECK: add sp, sp, #1575, lsl #12
+; CHECK: add sp, sp, #2576
+; CHECK: .cfi_endproc
+ ret void
+}
+
+define void @test_mediumframe() {
+; CHECK-LABEL: test_mediumframe:
+ %var1 = alloca i8, i32 1000000
+ %var2 = alloca i8, i32 16
+ %var3 = alloca i8, i32 1000000
+; CHECK: sub sp, sp, #488, lsl #12
+; CHECK-NEXT: sub sp, sp, #1168
+
+ store volatile i8* %var1, i8** @addr
+; CHECK: add [[VAR1ADDR:x[0-9]+]], sp, #244, lsl #12
+; CHECK: add [[VAR1ADDR]], [[VAR1ADDR]], #592
+
+; CHECK: add [[VAR2ADDR:x[0-9]+]], sp, #244, lsl #12
+; CHECK: add [[VAR2ADDR]], [[VAR2ADDR]], #576
+
+ store volatile i8* %var2, i8** @addr
+; CHECK: add sp, sp, #488, lsl #12
+; CHECK: add sp, sp, #1168
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-ld1.ll b/test/CodeGen/AArch64/arm64-ld1.ll
new file mode 100644
index 0000000..72d808c
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-ld1.ll
@@ -0,0 +1,1345 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+
+%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
+%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
+%struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
+
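+; ld2/ld3/ld4 are de-interleaving loads of 2/3/4 consecutive vector registers.
+; The struct results map onto v0-v3 and the pointer argument arrives in x0,
+; which is what the CHECK lines below verify.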
+define %struct.__neon_int8x8x2_t @ld2_8b(i8* %A) nounwind {
+; CHECK-LABEL: ld2_8b
+; Make sure we are loading into the results defined by the ABI (i.e., v0, v1)
+; and from the argument of the function, also defined by the ABI (i.e., x0)
+; CHECK: ld2.8b { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2.v8i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x8x2_t %tmp2
+}
+
+define %struct.__neon_int8x8x3_t @ld3_8b(i8* %A) nounwind {
+; CHECK-LABEL: ld3_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3.8b { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3.v8i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x8x3_t %tmp2
+}
+
+define %struct.__neon_int8x8x4_t @ld4_8b(i8* %A) nounwind {
+; CHECK-LABEL: ld4_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4.8b { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4.v8i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x8x4_t %tmp2
+}
+
+declare %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4.v8i8.p0i8(i8*) nounwind readonly
+
+%struct.__neon_int8x16x2_t = type { <16 x i8>, <16 x i8> }
+%struct.__neon_int8x16x3_t = type { <16 x i8>, <16 x i8>, <16 x i8> }
+%struct.__neon_int8x16x4_t = type { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }
+
+define %struct.__neon_int8x16x2_t @ld2_16b(i8* %A) nounwind {
+; CHECK-LABEL: ld2_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2.16b { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2.v16i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x16x2_t %tmp2
+}
+
+define %struct.__neon_int8x16x3_t @ld3_16b(i8* %A) nounwind {
+; CHECK-LABEL: ld3_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3.16b { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3.v16i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x16x3_t %tmp2
+}
+
+define %struct.__neon_int8x16x4_t @ld4_16b(i8* %A) nounwind {
+; CHECK-LABEL: ld4_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4.16b { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4.v16i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x16x4_t %tmp2
+}
+
+declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4.v16i8.p0i8(i8*) nounwind readonly
+
+%struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
+%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
+%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
+
+define %struct.__neon_int16x4x2_t @ld2_4h(i16* %A) nounwind {
+; CHECK-LABEL: ld2_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2.4h { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2.v4i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x4x2_t %tmp2
+}
+
+define %struct.__neon_int16x4x3_t @ld3_4h(i16* %A) nounwind {
+; CHECK-LABEL: ld3_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3.4h { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3.v4i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x4x3_t %tmp2
+}
+
+define %struct.__neon_int16x4x4_t @ld4_4h(i16* %A) nounwind {
+; CHECK-LABEL: ld4_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4.4h { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4.v4i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x4x4_t %tmp2
+}
+
+declare %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4.v4i16.p0i16(i16*) nounwind readonly
+
+%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
+%struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
+%struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
+
+define %struct.__neon_int16x8x2_t @ld2_8h(i16* %A) nounwind {
+; CHECK-LABEL: ld2_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2.8h { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2.v8i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x8x2_t %tmp2
+}
+
+define %struct.__neon_int16x8x3_t @ld3_8h(i16* %A) nounwind {
+; CHECK-LABEL: ld3_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3.8h { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3.v8i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x8x3_t %tmp2
+}
+
+define %struct.__neon_int16x8x4_t @ld4_8h(i16* %A) nounwind {
+; CHECK-LABEL: ld4_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4.8h { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4.v8i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x8x4_t %tmp2
+}
+
+declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4.v8i16.p0i16(i16*) nounwind readonly
+
+%struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
+%struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
+%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
+
+define %struct.__neon_int32x2x2_t @ld2_2s(i32* %A) nounwind {
+; CHECK-LABEL: ld2_2s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2.2s { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2.v2i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x2x2_t %tmp2
+}
+
+define %struct.__neon_int32x2x3_t @ld3_2s(i32* %A) nounwind {
+; CHECK-LABEL: ld3_2s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3.2s { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3.v2i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x2x3_t %tmp2
+}
+
+define %struct.__neon_int32x2x4_t @ld4_2s(i32* %A) nounwind {
+; CHECK-LABEL: ld4_2s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4.2s { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4.v2i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x2x4_t %tmp2
+}
+
+declare %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4.v2i32.p0i32(i32*) nounwind readonly
+
+%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
+%struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
+%struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
+
+define %struct.__neon_int32x4x2_t @ld2_4s(i32* %A) nounwind {
+; CHECK-LABEL: ld2_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2.4s { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2.v4i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x4x2_t %tmp2
+}
+
+define %struct.__neon_int32x4x3_t @ld3_4s(i32* %A) nounwind {
+; CHECK-LABEL: ld3_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3.4s { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3.v4i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x4x3_t %tmp2
+}
+
+define %struct.__neon_int32x4x4_t @ld4_4s(i32* %A) nounwind {
+; CHECK-LABEL: ld4_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4.4s { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4.v4i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x4x4_t %tmp2
+}
+
+declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4.v4i32.p0i32(i32*) nounwind readonly
+
+%struct.__neon_int64x2x2_t = type { <2 x i64>, <2 x i64> }
+%struct.__neon_int64x2x3_t = type { <2 x i64>, <2 x i64>, <2 x i64> }
+%struct.__neon_int64x2x4_t = type { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }
+
+define %struct.__neon_int64x2x2_t @ld2_2d(i64* %A) nounwind {
+; CHECK-LABEL: ld2_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2.2d { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2.v2i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x2x2_t %tmp2
+}
+
+define %struct.__neon_int64x2x3_t @ld3_2d(i64* %A) nounwind {
+; CHECK-LABEL: ld3_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3.2d { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3.v2i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x2x3_t %tmp2
+}
+
+define %struct.__neon_int64x2x4_t @ld4_2d(i64* %A) nounwind {
+; CHECK-LABEL: ld4_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4.2d { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4.v2i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x2x4_t %tmp2
+}
+
+declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4.v2i64.p0i64(i64*) nounwind readonly
+
+%struct.__neon_int64x1x2_t = type { <1 x i64>, <1 x i64> }
+%struct.__neon_int64x1x3_t = type { <1 x i64>, <1 x i64>, <1 x i64> }
+%struct.__neon_int64x1x4_t = type { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }
+
+
+define %struct.__neon_int64x1x2_t @ld2_1di64(i64* %A) nounwind {
+; CHECK-LABEL: ld2_1di64
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.1d { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2.v1i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x1x2_t %tmp2
+}
+
+define %struct.__neon_int64x1x3_t @ld3_1di64(i64* %A) nounwind {
+; CHECK-LABEL: ld3_1di64
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.1d { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3.v1i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x1x3_t %tmp2
+}
+
+define %struct.__neon_int64x1x4_t @ld4_1di64(i64* %A) nounwind {
+; CHECK-LABEL: ld4_1di64
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.1d { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4.v1i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x1x4_t %tmp2
+}
+
+
+declare %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4.v1i64.p0i64(i64*) nounwind readonly
+
+%struct.__neon_float64x1x2_t = type { <1 x double>, <1 x double> }
+%struct.__neon_float64x1x3_t = type { <1 x double>, <1 x double>, <1 x double> }
+%struct.__neon_float64x1x4_t = type { <1 x double>, <1 x double>, <1 x double>, <1 x double> }
+
+
+define %struct.__neon_float64x1x2_t @ld2_1df64(double* %A) nounwind {
+; CHECK-LABEL: ld2_1df64
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.1d { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld2.v1f64.p0f64(double* %A)
+ ret %struct.__neon_float64x1x2_t %tmp2
+}
+
+define %struct.__neon_float64x1x3_t @ld3_1df64(double* %A) nounwind {
+; CHECK-LABEL: ld3_1df64
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.1d { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld3.v1f64.p0f64(double* %A)
+ ret %struct.__neon_float64x1x3_t %tmp2
+}
+
+define %struct.__neon_float64x1x4_t @ld4_1df64(double* %A) nounwind {
+; CHECK-LABEL: ld4_1df64
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.1d { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld4.v1f64.p0f64(double* %A)
+ ret %struct.__neon_float64x1x4_t %tmp2
+}
+
+declare %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld2.v1f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld3.v1f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld4.v1f64.p0f64(double*) nounwind readonly
+
+
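+; The ldNlane intrinsics load a single element into lane 1 of N already-live
+; vectors; the inputs L1-L4 arrive in v0-v3, so the expected instructions
+; update those same registers in place.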
+define %struct.__neon_int8x16x2_t @ld2lane_16b(<16 x i8> %L1, <16 x i8> %L2, i8* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2lane_16b
+; CHECK: ld2.b { v0, v1 }[1], [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, i64 1, i8* %A)
+ ret %struct.__neon_int8x16x2_t %tmp2
+}
+
+define %struct.__neon_int8x16x3_t @ld3lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, i8* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3lane_16b
+; CHECK: ld3.b { v0, v1, v2 }[1], [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, i64 1, i8* %A)
+ ret %struct.__neon_int8x16x3_t %tmp2
+}
+
+define %struct.__neon_int8x16x4_t @ld4lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, <16 x i8> %L4, i8* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4lane_16b
+; CHECK: ld4.b { v0, v1, v2, v3 }[1], [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, <16 x i8> %L4, i64 1, i8* %A)
+ ret %struct.__neon_int8x16x4_t %tmp2
+}
+
+declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+
+define %struct.__neon_int16x8x2_t @ld2lane_8h(<8 x i16> %L1, <8 x i16> %L2, i16* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2lane_8h
+; CHECK: ld2.h { v0, v1 }[1], [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, i64 1, i16* %A)
+ ret %struct.__neon_int16x8x2_t %tmp2
+}
+
+define %struct.__neon_int16x8x3_t @ld3lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, i16* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3lane_8h
+; CHECK: ld3.h { v0, v1, v2 }[1], [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, i64 1, i16* %A)
+ ret %struct.__neon_int16x8x3_t %tmp2
+}
+
+define %struct.__neon_int16x8x4_t @ld4lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, <8 x i16> %L4, i16* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4lane_8h
+; CHECK: ld4.h { v0, v1, v2, v3 }[1], [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, <8 x i16> %L4, i64 1, i16* %A)
+ ret %struct.__neon_int16x8x4_t %tmp2
+}
+
+declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+
+define %struct.__neon_int32x4x2_t @ld2lane_4s(<4 x i32> %L1, <4 x i32> %L2, i32* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2lane_4s
+; CHECK: ld2.s { v0, v1 }[1], [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, i64 1, i32* %A)
+ ret %struct.__neon_int32x4x2_t %tmp2
+}
+
+define %struct.__neon_int32x4x3_t @ld3lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, i32* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3lane_4s
+; CHECK: ld3.s { v0, v1, v2 }[1], [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, i64 1, i32* %A)
+ ret %struct.__neon_int32x4x3_t %tmp2
+}
+
+define %struct.__neon_int32x4x4_t @ld4lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, <4 x i32> %L4, i32* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4lane_4s
+; CHECK: ld4.s { v0, v1, v2, v3 }[1], [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, <4 x i32> %L4, i64 1, i32* %A)
+ ret %struct.__neon_int32x4x4_t %tmp2
+}
+
+declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+
+define %struct.__neon_int64x2x2_t @ld2lane_2d(<2 x i64> %L1, <2 x i64> %L2, i64* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2lane_2d
+; CHECK: ld2.d { v0, v1 }[1], [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, i64 1, i64* %A)
+ ret %struct.__neon_int64x2x2_t %tmp2
+}
+
+define %struct.__neon_int64x2x3_t @ld3lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, i64* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3lane_2d
+; CHECK: ld3.d { v0, v1, v2 }[1], [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, i64 1, i64* %A)
+ ret %struct.__neon_int64x2x3_t %tmp2
+}
+
+define %struct.__neon_int64x2x4_t @ld4lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, <2 x i64> %L4, i64* %A) nounwind {
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4lane_2d
+; CHECK: ld4.d { v0, v1, v2, v3 }[1], [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, <2 x i64> %L4, i64 1, i64* %A)
+ ret %struct.__neon_int64x2x4_t %tmp2
+}
+
+declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+
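+; ld1r loads one element and replicates it to every lane. The tests below build
+; the splat with a chain of insertelements and expect a single ld1r to be
+; selected.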
+define <8 x i8> @ld1r_8b(i8* %bar) {
+; CHECK: ld1r_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.8b { v0 }, [x0]
+; CHECK-NEXT: ret
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
+ %tmp3 = insertelement <8 x i8> %tmp2, i8 %tmp1, i32 1
+ %tmp4 = insertelement <8 x i8> %tmp3, i8 %tmp1, i32 2
+ %tmp5 = insertelement <8 x i8> %tmp4, i8 %tmp1, i32 3
+ %tmp6 = insertelement <8 x i8> %tmp5, i8 %tmp1, i32 4
+ %tmp7 = insertelement <8 x i8> %tmp6, i8 %tmp1, i32 5
+ %tmp8 = insertelement <8 x i8> %tmp7, i8 %tmp1, i32 6
+ %tmp9 = insertelement <8 x i8> %tmp8, i8 %tmp1, i32 7
+ ret <8 x i8> %tmp9
+}
+
+define <16 x i8> @ld1r_16b(i8* %bar) {
+; CHECK: ld1r_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.16b { v0 }, [x0]
+; CHECK-NEXT: ret
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
+ %tmp3 = insertelement <16 x i8> %tmp2, i8 %tmp1, i32 1
+ %tmp4 = insertelement <16 x i8> %tmp3, i8 %tmp1, i32 2
+ %tmp5 = insertelement <16 x i8> %tmp4, i8 %tmp1, i32 3
+ %tmp6 = insertelement <16 x i8> %tmp5, i8 %tmp1, i32 4
+ %tmp7 = insertelement <16 x i8> %tmp6, i8 %tmp1, i32 5
+ %tmp8 = insertelement <16 x i8> %tmp7, i8 %tmp1, i32 6
+ %tmp9 = insertelement <16 x i8> %tmp8, i8 %tmp1, i32 7
+ %tmp10 = insertelement <16 x i8> %tmp9, i8 %tmp1, i32 8
+ %tmp11 = insertelement <16 x i8> %tmp10, i8 %tmp1, i32 9
+ %tmp12 = insertelement <16 x i8> %tmp11, i8 %tmp1, i32 10
+ %tmp13 = insertelement <16 x i8> %tmp12, i8 %tmp1, i32 11
+ %tmp14 = insertelement <16 x i8> %tmp13, i8 %tmp1, i32 12
+ %tmp15 = insertelement <16 x i8> %tmp14, i8 %tmp1, i32 13
+ %tmp16 = insertelement <16 x i8> %tmp15, i8 %tmp1, i32 14
+ %tmp17 = insertelement <16 x i8> %tmp16, i8 %tmp1, i32 15
+ ret <16 x i8> %tmp17
+}
+
+define <4 x i16> @ld1r_4h(i16* %bar) {
+; CHECK: ld1r_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.4h { v0 }, [x0]
+; CHECK-NEXT: ret
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <4 x i16> <i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmp1, i32 0
+ %tmp3 = insertelement <4 x i16> %tmp2, i16 %tmp1, i32 1
+ %tmp4 = insertelement <4 x i16> %tmp3, i16 %tmp1, i32 2
+ %tmp5 = insertelement <4 x i16> %tmp4, i16 %tmp1, i32 3
+ ret <4 x i16> %tmp5
+}
+
+define <8 x i16> @ld1r_8h(i16* %bar) {
+; CHECK: ld1r_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.8h { v0 }, [x0]
+; CHECK-NEXT: ret
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmp1, i32 0
+ %tmp3 = insertelement <8 x i16> %tmp2, i16 %tmp1, i32 1
+ %tmp4 = insertelement <8 x i16> %tmp3, i16 %tmp1, i32 2
+ %tmp5 = insertelement <8 x i16> %tmp4, i16 %tmp1, i32 3
+ %tmp6 = insertelement <8 x i16> %tmp5, i16 %tmp1, i32 4
+ %tmp7 = insertelement <8 x i16> %tmp6, i16 %tmp1, i32 5
+ %tmp8 = insertelement <8 x i16> %tmp7, i16 %tmp1, i32 6
+ %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 7
+ ret <8 x i16> %tmp9
+}
+
+define <2 x i32> @ld1r_2s(i32* %bar) {
+; CHECK: ld1r_2s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.2s { v0 }, [x0]
+; CHECK-NEXT: ret
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <2 x i32> <i32 undef, i32 undef>, i32 %tmp1, i32 0
+ %tmp3 = insertelement <2 x i32> %tmp2, i32 %tmp1, i32 1
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @ld1r_4s(i32* %bar) {
+; CHECK: ld1r_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.4s { v0 }, [x0]
+; CHECK-NEXT: ret
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <4 x i32> <i32 undef, i32 undef, i32 undef, i32 undef>, i32 %tmp1, i32 0
+ %tmp3 = insertelement <4 x i32> %tmp2, i32 %tmp1, i32 1
+ %tmp4 = insertelement <4 x i32> %tmp3, i32 %tmp1, i32 2
+ %tmp5 = insertelement <4 x i32> %tmp4, i32 %tmp1, i32 3
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @ld1r_2d(i64* %bar) {
+; CHECK: ld1r_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.2d { v0 }, [x0]
+; CHECK-NEXT: ret
+ %tmp1 = load i64* %bar
+ %tmp2 = insertelement <2 x i64> <i64 undef, i64 undef>, i64 %tmp1, i32 0
+ %tmp3 = insertelement <2 x i64> %tmp2, i64 %tmp1, i32 1
+ ret <2 x i64> %tmp3
+}
+
+define %struct.__neon_int8x8x2_t @ld2r_8b(i8* %A) nounwind {
+; CHECK: ld2r_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2r.8b { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x8x2_t %tmp2
+}
+
+define %struct.__neon_int8x8x3_t @ld3r_8b(i8* %A) nounwind {
+; CHECK: ld3r_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3r.8b { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x8x3_t %tmp2
+}
+
+define %struct.__neon_int8x8x4_t @ld4r_8b(i8* %A) nounwind {
+; CHECK: ld4r_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4r.8b { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x8x4_t %tmp2
+}
+
+declare %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8*) nounwind readonly
+
+define %struct.__neon_int8x16x2_t @ld2r_16b(i8* %A) nounwind {
+; CHECK: ld2r_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2r.16b { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x16x2_t %tmp2
+}
+
+define %struct.__neon_int8x16x3_t @ld3r_16b(i8* %A) nounwind {
+; CHECK: ld3r_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3r.16b { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x16x3_t %tmp2
+}
+
+define %struct.__neon_int8x16x4_t @ld4r_16b(i8* %A) nounwind {
+; CHECK: ld4r_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4r.16b { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %A)
+ ret %struct.__neon_int8x16x4_t %tmp2
+}
+
+declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8*) nounwind readonly
+
+define %struct.__neon_int16x4x2_t @ld2r_4h(i16* %A) nounwind {
+; CHECK: ld2r_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2r.4h { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x4x2_t %tmp2
+}
+
+define %struct.__neon_int16x4x3_t @ld3r_4h(i16* %A) nounwind {
+; CHECK: ld3r_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3r.4h { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x4x3_t %tmp2
+}
+
+define %struct.__neon_int16x4x4_t @ld4r_4h(i16* %A) nounwind {
+; CHECK: ld4r_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4r.4h { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x4x4_t %tmp2
+}
+
+declare %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16*) nounwind readonly
+
+define %struct.__neon_int16x8x2_t @ld2r_8h(i16* %A) nounwind {
+; CHECK: ld2r_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2r.8h { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x8x2_t %tmp2
+}
+
+define %struct.__neon_int16x8x3_t @ld3r_8h(i16* %A) nounwind {
+; CHECK: ld3r_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3r.8h { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x8x3_t %tmp2
+}
+
+define %struct.__neon_int16x8x4_t @ld4r_8h(i16* %A) nounwind {
+; CHECK: ld4r_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4r.8h { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* %A)
+ ret %struct.__neon_int16x8x4_t %tmp2
+}
+
+declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16*) nounwind readonly
+
+define %struct.__neon_int32x2x2_t @ld2r_2s(i32* %A) nounwind {
+; CHECK: ld2r_2s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2r.2s { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x2x2_t %tmp2
+}
+
+define %struct.__neon_int32x2x3_t @ld3r_2s(i32* %A) nounwind {
+; CHECK: ld3r_2s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3r.2s { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x2x3_t %tmp2
+}
+
+define %struct.__neon_int32x2x4_t @ld4r_2s(i32* %A) nounwind {
+; CHECK: ld4r_2s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x2x4_t %tmp2
+}
+
+declare %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32*) nounwind readonly
+
+define %struct.__neon_int32x4x2_t @ld2r_4s(i32* %A) nounwind {
+; CHECK: ld2r_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2r.4s { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x4x2_t %tmp2
+}
+
+define %struct.__neon_int32x4x3_t @ld3r_4s(i32* %A) nounwind {
+; CHECK: ld3r_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3r.4s { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x4x3_t %tmp2
+}
+
+define %struct.__neon_int32x4x4_t @ld4r_4s(i32* %A) nounwind {
+; CHECK: ld4r_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* %A)
+ ret %struct.__neon_int32x4x4_t %tmp2
+}
+
+declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32*) nounwind readonly
+
+define %struct.__neon_int64x1x2_t @ld2r_1d(i64* %A) nounwind {
+; CHECK: ld2r_1d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2r.1d { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x1x2_t %tmp2
+}
+
+define %struct.__neon_int64x1x3_t @ld3r_1d(i64* %A) nounwind {
+; CHECK: ld3r_1d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3r.1d { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x1x3_t %tmp2
+}
+
+define %struct.__neon_int64x1x4_t @ld4r_1d(i64* %A) nounwind {
+; CHECK: ld4r_1d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x1x4_t %tmp2
+}
+
+declare %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64*) nounwind readonly
+
+define %struct.__neon_int64x2x2_t @ld2r_2d(i64* %A) nounwind {
+; CHECK: ld2r_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld2r.2d { v0, v1 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x2x2_t %tmp2
+}
+
+define %struct.__neon_int64x2x3_t @ld3r_2d(i64* %A) nounwind {
+; CHECK: ld3r_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld3r.2d { v0, v1, v2 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x2x3_t %tmp2
+}
+
+define %struct.__neon_int64x2x4_t @ld4r_2d(i64* %A) nounwind {
+; CHECK: ld4r_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0]
+; CHECK-NEXT: ret
+ %tmp2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* %A)
+ ret %struct.__neon_int64x2x4_t %tmp2
+}
+
+declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64*) nounwind readonly
+
+define <16 x i8> @ld1_16b(<16 x i8> %V, i8* %bar) {
+; CHECK-LABEL: ld1_16b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.b { v0 }[0], [x0]
+; CHECK-NEXT: ret
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <16 x i8> %V, i8 %tmp1, i32 0
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @ld1_8h(<8 x i16> %V, i16* %bar) {
+; CHECK-LABEL: ld1_8h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.h { v0 }[0], [x0]
+; CHECK-NEXT: ret
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <8 x i16> %V, i16 %tmp1, i32 0
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @ld1_4s(<4 x i32> %V, i32* %bar) {
+; CHECK-LABEL: ld1_4s
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.s { v0 }[0], [x0]
+; CHECK-NEXT: ret
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <4 x i32> %V, i32 %tmp1, i32 0
+ ret <4 x i32> %tmp2
+}
+
+define <4 x float> @ld1_4s_float(<4 x float> %V, float* %bar) {
+; CHECK-LABEL: ld1_4s_float:
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.s { v0 }[0], [x0]
+; CHECK-NEXT: ret
+ %tmp1 = load float* %bar
+ %tmp2 = insertelement <4 x float> %V, float %tmp1, i32 0
+ ret <4 x float> %tmp2
+}
+
+define <2 x i64> @ld1_2d(<2 x i64> %V, i64* %bar) {
+; CHECK-LABEL: ld1_2d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.d { v0 }[0], [x0]
+; CHECK-NEXT: ret
+ %tmp1 = load i64* %bar
+ %tmp2 = insertelement <2 x i64> %V, i64 %tmp1, i32 0
+ ret <2 x i64> %tmp2
+}
+
+define <2 x double> @ld1_2d_double(<2 x double> %V, double* %bar) {
+; CHECK-LABEL: ld1_2d_double:
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.d { v0 }[0], [x0]
+; CHECK-NEXT: ret
+ %tmp1 = load double* %bar
+ %tmp2 = insertelement <2 x double> %V, double %tmp1, i32 0
+ ret <2 x double> %tmp2
+}
+
+define <1 x i64> @ld1_1d(<1 x i64>* %p) {
+; CHECK-LABEL: ld1_1d
+; Make sure we are using the operands defined by the ABI
+; CHECK: ldr [[REG:d[0-9]+]], [x0]
+; CHECK-NEXT: ret
+ %tmp = load <1 x i64>* %p, align 8
+ ret <1 x i64> %tmp
+}
+
+define <8 x i8> @ld1_8b(<8 x i8> %V, i8* %bar) {
+; CHECK-LABEL: ld1_8b
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.b { v0 }[0], [x0]
+; CHECK-NEXT: ret
+ %tmp1 = load i8* %bar
+ %tmp2 = insertelement <8 x i8> %V, i8 %tmp1, i32 0
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @ld1_4h(<4 x i16> %V, i16* %bar) {
+; CHECK-LABEL: ld1_4h
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.h { v0 }[0], [x0]
+; CHECK-NEXT: ret
+ %tmp1 = load i16* %bar
+ %tmp2 = insertelement <4 x i16> %V, i16 %tmp1, i32 0
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @ld1_2s(<2 x i32> %V, i32* %bar) {
+; CHECK-LABEL: ld1_2s:
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.s { v0 }[0], [x0]
+; CHECK-NEXT: ret
+ %tmp1 = load i32* %bar
+ %tmp2 = insertelement <2 x i32> %V, i32 %tmp1, i32 0
+ ret <2 x i32> %tmp2
+}
+
+define <2 x float> @ld1_2s_float(<2 x float> %V, float* %bar) {
+; CHECK-LABEL: ld1_2s_float:
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1.s { v0 }[0], [x0]
+; CHECK-NEXT: ret
+ %tmp1 = load float* %bar
+ %tmp2 = insertelement <2 x float> %V, float %tmp1, i32 0
+ ret <2 x float> %tmp2
+}
+
+
+; Add rdar://13098923 test case: vld1_dup_u32 doesn't generate ld1r.2s
+define void @ld1r_2s_from_dup(i8* nocapture %a, i8* nocapture %b, i16* nocapture %diff) nounwind ssp {
+entry:
+; CHECK: ld1r_2s_from_dup
+; CHECK: ld1r.2s { [[ARG1:v[0-9]+]] }, [x0]
+; CHECK-NEXT: ld1r.2s { [[ARG2:v[0-9]+]] }, [x1]
+; CHECK-NEXT: usubl.8h v[[RESREGNUM:[0-9]+]], [[ARG1]], [[ARG2]]
+; CHECK-NEXT: str d[[RESREGNUM]], [x2]
+; CHECK-NEXT: ret
+ %tmp = bitcast i8* %a to i32*
+ %tmp1 = load i32* %tmp, align 4
+ %tmp2 = insertelement <2 x i32> undef, i32 %tmp1, i32 0
+ %lane = shufflevector <2 x i32> %tmp2, <2 x i32> undef, <2 x i32> zeroinitializer
+ %tmp3 = bitcast <2 x i32> %lane to <8 x i8>
+ %tmp4 = bitcast i8* %b to i32*
+ %tmp5 = load i32* %tmp4, align 4
+ %tmp6 = insertelement <2 x i32> undef, i32 %tmp5, i32 0
+ %lane1 = shufflevector <2 x i32> %tmp6, <2 x i32> undef, <2 x i32> zeroinitializer
+ %tmp7 = bitcast <2 x i32> %lane1 to <8 x i8>
+ %vmovl.i.i = zext <8 x i8> %tmp3 to <8 x i16>
+ %vmovl.i4.i = zext <8 x i8> %tmp7 to <8 x i16>
+ %sub.i = sub <8 x i16> %vmovl.i.i, %vmovl.i4.i
+ %tmp8 = bitcast <8 x i16> %sub.i to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %tmp8, <2 x i64> undef, <1 x i32> zeroinitializer
+ %tmp9 = bitcast <1 x i64> %shuffle.i to <4 x i16>
+ %tmp10 = bitcast i16* %diff to <4 x i16>*
+ store <4 x i16> %tmp9, <4 x i16>* %tmp10, align 8
+ ret void
+}
+
+; Tests for rdar://11947069: vld1_dup_* and vld1q_dup_* code gen is suboptimal
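+; In each function below a single scalar load is splatted to every lane,
+; either through a chain of insertelement instructions or through an
+; insertelement followed by an all-zeros shufflevector mask; the expectation
+; is that this splat-of-load pattern selects to a single ld1r, except in the
+; <1 x double> cases where a plain ldr of d0 already fills the whole vector.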
+define <4 x float> @ld1r_4s_float(float* nocapture %x) {
+entry:
+; CHECK-LABEL: ld1r_4s_float
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.4s { v0 }, [x0]
+; CHECK-NEXT: ret
+ %tmp = load float* %x, align 4
+ %tmp1 = insertelement <4 x float> undef, float %tmp, i32 0
+ %tmp2 = insertelement <4 x float> %tmp1, float %tmp, i32 1
+ %tmp3 = insertelement <4 x float> %tmp2, float %tmp, i32 2
+ %tmp4 = insertelement <4 x float> %tmp3, float %tmp, i32 3
+ ret <4 x float> %tmp4
+}
+
+define <2 x float> @ld1r_2s_float(float* nocapture %x) {
+entry:
+; CHECK-LABEL: ld1r_2s_float
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.2s { v0 }, [x0]
+; CHECK-NEXT: ret
+ %tmp = load float* %x, align 4
+ %tmp1 = insertelement <2 x float> undef, float %tmp, i32 0
+ %tmp2 = insertelement <2 x float> %tmp1, float %tmp, i32 1
+ ret <2 x float> %tmp2
+}
+
+define <2 x double> @ld1r_2d_double(double* nocapture %x) {
+entry:
+; CHECK-LABEL: ld1r_2d_double
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.2d { v0 }, [x0]
+; CHECK-NEXT: ret
+ %tmp = load double* %x, align 4
+ %tmp1 = insertelement <2 x double> undef, double %tmp, i32 0
+ %tmp2 = insertelement <2 x double> %tmp1, double %tmp, i32 1
+ ret <2 x double> %tmp2
+}
+
+define <1 x double> @ld1r_1d_double(double* nocapture %x) {
+entry:
+; CHECK-LABEL: ld1r_1d_double
+; Make sure we are using the operands defined by the ABI
+; CHECK: ldr d0, [x0]
+; CHECK-NEXT: ret
+ %tmp = load double* %x, align 4
+ %tmp1 = insertelement <1 x double> undef, double %tmp, i32 0
+ ret <1 x double> %tmp1
+}
+
+define <4 x float> @ld1r_4s_float_shuff(float* nocapture %x) {
+entry:
+; CHECK-LABEL: ld1r_4s_float_shuff
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.4s { v0 }, [x0]
+; CHECK-NEXT: ret
+ %tmp = load float* %x, align 4
+ %tmp1 = insertelement <4 x float> undef, float %tmp, i32 0
+ %lane = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %lane
+}
+
+define <2 x float> @ld1r_2s_float_shuff(float* nocapture %x) {
+entry:
+; CHECK-LABEL: ld1r_2s_float_shuff
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.2s { v0 }, [x0]
+; CHECK-NEXT: ret
+ %tmp = load float* %x, align 4
+ %tmp1 = insertelement <2 x float> undef, float %tmp, i32 0
+ %lane = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
+ ret <2 x float> %lane
+}
+
+define <2 x double> @ld1r_2d_double_shuff(double* nocapture %x) {
+entry:
+; CHECK-LABEL: ld1r_2d_double_shuff
+; Make sure we are using the operands defined by the ABI
+; CHECK: ld1r.2d { v0 }, [x0]
+; CHECK-NEXT: ret
+ %tmp = load double* %x, align 4
+ %tmp1 = insertelement <2 x double> undef, double %tmp, i32 0
+ %lane = shufflevector <2 x double> %tmp1, <2 x double> undef, <2 x i32> zeroinitializer
+ ret <2 x double> %lane
+}
+
+define <1 x double> @ld1r_1d_double_shuff(double* nocapture %x) {
+entry:
+; CHECK-LABEL: ld1r_1d_double_shuff
+; Make sure we are using the operands defined by the ABI
+; CHECK: ldr d0, [x0]
+; CHECK-NEXT: ret
+ %tmp = load double* %x, align 4
+ %tmp1 = insertelement <1 x double> undef, double %tmp, i32 0
+ %lane = shufflevector <1 x double> %tmp1, <1 x double> undef, <1 x i32> zeroinitializer
+ ret <1 x double> %lane
+}
+
+%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }
+%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }
+%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }
+
+declare %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double*) nounwind readonly
+
+define %struct.__neon_int8x8x2_t @ld1_x2_v8i8(i8* %addr) {
+; CHECK-LABEL: ld1_x2_v8i8:
+; CHECK: ld1.8b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %addr)
+ ret %struct.__neon_int8x8x2_t %val
+}
+
+define %struct.__neon_int16x4x2_t @ld1_x2_v4i16(i16* %addr) {
+; CHECK-LABEL: ld1_x2_v4i16:
+; CHECK: ld1.4h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* %addr)
+ ret %struct.__neon_int16x4x2_t %val
+}
+
+define %struct.__neon_int32x2x2_t @ld1_x2_v2i32(i32* %addr) {
+; CHECK-LABEL: ld1_x2_v2i32:
+; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* %addr)
+ ret %struct.__neon_int32x2x2_t %val
+}
+
+define %struct.__neon_float32x2x2_t @ld1_x2_v2f32(float* %addr) {
+; CHECK-LABEL: ld1_x2_v2f32:
+; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* %addr)
+ ret %struct.__neon_float32x2x2_t %val
+}
+
+define %struct.__neon_int64x1x2_t @ld1_x2_v1i64(i64* %addr) {
+; CHECK-LABEL: ld1_x2_v1i64:
+; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* %addr)
+ ret %struct.__neon_int64x1x2_t %val
+}
+
+define %struct.__neon_float64x1x2_t @ld1_x2_v1f64(double* %addr) {
+; CHECK-LABEL: ld1_x2_v1f64:
+; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double* %addr)
+ ret %struct.__neon_float64x1x2_t %val
+}
+
+
+%struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }
+%struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }
+%struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }
+
+%struct.__neon_float64x2x2_t = type { <2 x double>, <2 x double> }
+%struct.__neon_float64x2x3_t = type { <2 x double>, <2 x double>, <2 x double> }
+%struct.__neon_float64x2x4_t = type { <2 x double>, <2 x double>, <2 x double>, <2 x double> }
+
+declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x4x2_t @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x2x2_t @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double*) nounwind readonly
+
+define %struct.__neon_int8x16x2_t @ld1_x2_v16i8(i8* %addr) {
+; CHECK-LABEL: ld1_x2_v16i8:
+; CHECK: ld1.16b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %addr)
+ ret %struct.__neon_int8x16x2_t %val
+}
+
+define %struct.__neon_int16x8x2_t @ld1_x2_v8i16(i16* %addr) {
+; CHECK-LABEL: ld1_x2_v8i16:
+; CHECK: ld1.8h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* %addr)
+ ret %struct.__neon_int16x8x2_t %val
+}
+
+define %struct.__neon_int32x4x2_t @ld1_x2_v4i32(i32* %addr) {
+; CHECK-LABEL: ld1_x2_v4i32:
+; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* %addr)
+ ret %struct.__neon_int32x4x2_t %val
+}
+
+define %struct.__neon_float32x4x2_t @ld1_x2_v4f32(float* %addr) {
+; CHECK-LABEL: ld1_x2_v4f32:
+; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float32x4x2_t @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float* %addr)
+ ret %struct.__neon_float32x4x2_t %val
+}
+
+define %struct.__neon_int64x2x2_t @ld1_x2_v2i64(i64* %addr) {
+; CHECK-LABEL: ld1_x2_v2i64:
+; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* %addr)
+ ret %struct.__neon_int64x2x2_t %val
+}
+
+define %struct.__neon_float64x2x2_t @ld1_x2_v2f64(double* %addr) {
+; CHECK-LABEL: ld1_x2_v2f64:
+; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float64x2x2_t @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double* %addr)
+ ret %struct.__neon_float64x2x2_t %val
+}
+
+declare %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double*) nounwind readonly
+
+define %struct.__neon_int8x8x3_t @ld1_x3_v8i8(i8* %addr) {
+; CHECK-LABEL: ld1_x3_v8i8:
+; CHECK: ld1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %addr)
+ ret %struct.__neon_int8x8x3_t %val
+}
+
+define %struct.__neon_int16x4x3_t @ld1_x3_v4i16(i16* %addr) {
+; CHECK-LABEL: ld1_x3_v4i16:
+; CHECK: ld1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* %addr)
+ ret %struct.__neon_int16x4x3_t %val
+}
+
+define %struct.__neon_int32x2x3_t @ld1_x3_v2i32(i32* %addr) {
+; CHECK-LABEL: ld1_x3_v2i32:
+; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* %addr)
+ ret %struct.__neon_int32x2x3_t %val
+}
+
+define %struct.__neon_float32x2x3_t @ld1_x3_v2f32(float* %addr) {
+; CHECK-LABEL: ld1_x3_v2f32:
+; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* %addr)
+ ret %struct.__neon_float32x2x3_t %val
+}
+
+define %struct.__neon_int64x1x3_t @ld1_x3_v1i64(i64* %addr) {
+; CHECK-LABEL: ld1_x3_v1i64:
+; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* %addr)
+ ret %struct.__neon_int64x1x3_t %val
+}
+
+define %struct.__neon_float64x1x3_t @ld1_x3_v1f64(double* %addr) {
+; CHECK-LABEL: ld1_x3_v1f64:
+; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double* %addr)
+ ret %struct.__neon_float64x1x3_t %val
+}
+
+declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x4x3_t @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x2x3_t @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double*) nounwind readonly
+
+define %struct.__neon_int8x16x3_t @ld1_x3_v16i8(i8* %addr) {
+; CHECK-LABEL: ld1_x3_v16i8:
+; CHECK: ld1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %addr)
+ ret %struct.__neon_int8x16x3_t %val
+}
+
+define %struct.__neon_int16x8x3_t @ld1_x3_v8i16(i16* %addr) {
+; CHECK-LABEL: ld1_x3_v8i16:
+; CHECK: ld1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* %addr)
+ ret %struct.__neon_int16x8x3_t %val
+}
+
+define %struct.__neon_int32x4x3_t @ld1_x3_v4i32(i32* %addr) {
+; CHECK-LABEL: ld1_x3_v4i32:
+; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* %addr)
+ ret %struct.__neon_int32x4x3_t %val
+}
+
+define %struct.__neon_float32x4x3_t @ld1_x3_v4f32(float* %addr) {
+; CHECK-LABEL: ld1_x3_v4f32:
+; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float32x4x3_t @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float* %addr)
+ ret %struct.__neon_float32x4x3_t %val
+}
+
+define %struct.__neon_int64x2x3_t @ld1_x3_v2i64(i64* %addr) {
+; CHECK-LABEL: ld1_x3_v2i64:
+; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* %addr)
+ ret %struct.__neon_int64x2x3_t %val
+}
+
+define %struct.__neon_float64x2x3_t @ld1_x3_v2f64(double* %addr) {
+; CHECK-LABEL: ld1_x3_v2f64:
+; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float64x2x3_t @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double* %addr)
+ ret %struct.__neon_float64x2x3_t %val
+}
+
+declare %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double*) nounwind readonly
+
+define %struct.__neon_int8x8x4_t @ld1_x4_v8i8(i8* %addr) {
+; CHECK-LABEL: ld1_x4_v8i8:
+; CHECK: ld1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %addr)
+ ret %struct.__neon_int8x8x4_t %val
+}
+
+define %struct.__neon_int16x4x4_t @ld1_x4_v4i16(i16* %addr) {
+; CHECK-LABEL: ld1_x4_v4i16:
+; CHECK: ld1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* %addr)
+ ret %struct.__neon_int16x4x4_t %val
+}
+
+define %struct.__neon_int32x2x4_t @ld1_x4_v2i32(i32* %addr) {
+; CHECK-LABEL: ld1_x4_v2i32:
+; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* %addr)
+ ret %struct.__neon_int32x2x4_t %val
+}
+
+define %struct.__neon_float32x2x4_t @ld1_x4_v2f32(float* %addr) {
+; CHECK-LABEL: ld1_x4_v2f32:
+; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* %addr)
+ ret %struct.__neon_float32x2x4_t %val
+}
+
+define %struct.__neon_int64x1x4_t @ld1_x4_v1i64(i64* %addr) {
+; CHECK-LABEL: ld1_x4_v1i64:
+; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* %addr)
+ ret %struct.__neon_int64x1x4_t %val
+}
+
+define %struct.__neon_float64x1x4_t @ld1_x4_v1f64(double* %addr) {
+; CHECK-LABEL: ld1_x4_v1f64:
+; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double* %addr)
+ ret %struct.__neon_float64x1x4_t %val
+}
+
+declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x4x4_t @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x2x4_t @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double*) nounwind readonly
+
+define %struct.__neon_int8x16x4_t @ld1_x4_v16i8(i8* %addr) {
+; CHECK-LABEL: ld1_x4_v16i8:
+; CHECK: ld1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %addr)
+ ret %struct.__neon_int8x16x4_t %val
+}
+
+define %struct.__neon_int16x8x4_t @ld1_x4_v8i16(i16* %addr) {
+; CHECK-LABEL: ld1_x4_v8i16:
+; CHECK: ld1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* %addr)
+ ret %struct.__neon_int16x8x4_t %val
+}
+
+define %struct.__neon_int32x4x4_t @ld1_x4_v4i32(i32* %addr) {
+; CHECK-LABEL: ld1_x4_v4i32:
+; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* %addr)
+ ret %struct.__neon_int32x4x4_t %val
+}
+
+define %struct.__neon_float32x4x4_t @ld1_x4_v4f32(float* %addr) {
+; CHECK-LABEL: ld1_x4_v4f32:
+; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float32x4x4_t @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float* %addr)
+ ret %struct.__neon_float32x4x4_t %val
+}
+
+define %struct.__neon_int64x2x4_t @ld1_x4_v2i64(i64* %addr) {
+; CHECK-LABEL: ld1_x4_v2i64:
+; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* %addr)
+ ret %struct.__neon_int64x2x4_t %val
+}
+
+define %struct.__neon_float64x2x4_t @ld1_x4_v2f64(double* %addr) {
+; CHECK-LABEL: ld1_x4_v2f64:
+; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ %val = call %struct.__neon_float64x2x4_t @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double* %addr)
+ ret %struct.__neon_float64x2x4_t %val
+}
diff --git a/test/CodeGen/AArch64/arm64-ldp.ll b/test/CodeGen/AArch64/arm64-ldp.ll
new file mode 100644
index 0000000..5a98626
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-ldp.ll
@@ -0,0 +1,149 @@
+; RUN: llc < %s -march=arm64 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-unscaled-mem-op=true\
+; RUN: -verify-machineinstrs | FileCheck -check-prefix=LDUR_CHK %s
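+
+; The first RUN line checks plain ldp formation; the second reruns llc with
+; -aarch64-unscaled-mem-op=true so that unscaled (ldur-style) accesses may also
+; be paired, with those expectations checked under the LDUR_CHK prefix.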
+
+; CHECK: ldp_int
+; CHECK: ldp
+define i32 @ldp_int(i32* %p) nounwind {
+ %tmp = load i32* %p, align 4
+ %add.ptr = getelementptr inbounds i32* %p, i64 1
+ %tmp1 = load i32* %add.ptr, align 4
+ %add = add nsw i32 %tmp1, %tmp
+ ret i32 %add
+}
+
+; CHECK: ldp_long
+; CHECK: ldp
+define i64 @ldp_long(i64* %p) nounwind {
+ %tmp = load i64* %p, align 8
+ %add.ptr = getelementptr inbounds i64* %p, i64 1
+ %tmp1 = load i64* %add.ptr, align 8
+ %add = add nsw i64 %tmp1, %tmp
+ ret i64 %add
+}
+
+; CHECK: ldp_float
+; CHECK: ldp
+define float @ldp_float(float* %p) nounwind {
+ %tmp = load float* %p, align 4
+ %add.ptr = getelementptr inbounds float* %p, i64 1
+ %tmp1 = load float* %add.ptr, align 4
+ %add = fadd float %tmp, %tmp1
+ ret float %add
+}
+
+; CHECK: ldp_double
+; CHECK: ldp
+define double @ldp_double(double* %p) nounwind {
+ %tmp = load double* %p, align 8
+ %add.ptr = getelementptr inbounds double* %p, i64 1
+ %tmp1 = load double* %add.ptr, align 8
+ %add = fadd double %tmp, %tmp1
+ ret double %add
+}
+
+; Test the load/store optimizer---combine ldurs into a ldp, if appropriate
+define i32 @ldur_int(i32* %a) nounwind {
+; LDUR_CHK: ldur_int
+; LDUR_CHK: ldp [[DST1:w[0-9]+]], [[DST2:w[0-9]+]], [x0, #-8]
+; LDUR_CHK-NEXT: add w{{[0-9]+}}, [[DST2]], [[DST1]]
+; LDUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds i32* %a, i32 -1
+ %tmp1 = load i32* %p1, align 2
+ %p2 = getelementptr inbounds i32* %a, i32 -2
+ %tmp2 = load i32* %p2, align 2
+ %tmp3 = add i32 %tmp1, %tmp2
+ ret i32 %tmp3
+}
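+; Roughly, the two unscaled loads at [x0, #-4] and [x0, #-8] should collapse
+; into the single "ldp ..., [x0, #-8]" checked above instead of staying as two
+; separate ldur instructions.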
+
+define i64 @ldur_long(i64* %a) nounwind ssp {
+; LDUR_CHK: ldur_long
+; LDUR_CHK: ldp [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-16]
+; LDUR_CHK-NEXT: add x{{[0-9]+}}, [[DST2]], [[DST1]]
+; LDUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds i64* %a, i64 -1
+ %tmp1 = load i64* %p1, align 2
+ %p2 = getelementptr inbounds i64* %a, i64 -2
+ %tmp2 = load i64* %p2, align 2
+ %tmp3 = add i64 %tmp1, %tmp2
+ ret i64 %tmp3
+}
+
+define float @ldur_float(float* %a) {
+; LDUR_CHK: ldur_float
+; LDUR_CHK: ldp [[DST1:s[0-9]+]], [[DST2:s[0-9]+]], [x0, #-8]
+; LDUR_CHK-NEXT: add s{{[0-9]+}}, [[DST2]], [[DST1]]
+; LDUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds float* %a, i64 -1
+ %tmp1 = load float* %p1, align 2
+ %p2 = getelementptr inbounds float* %a, i64 -2
+ %tmp2 = load float* %p2, align 2
+ %tmp3 = fadd float %tmp1, %tmp2
+ ret float %tmp3
+}
+
+define double @ldur_double(double* %a) {
+; LDUR_CHK: ldur_double
+; LDUR_CHK: ldp [[DST1:d[0-9]+]], [[DST2:d[0-9]+]], [x0, #-16]
+; LDUR_CHK-NEXT: add d{{[0-9]+}}, [[DST2]], [[DST1]]
+; LDUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds double* %a, i64 -1
+ %tmp1 = load double* %p1, align 2
+ %p2 = getelementptr inbounds double* %a, i64 -2
+ %tmp2 = load double* %p2, align 2
+ %tmp3 = fadd double %tmp1, %tmp2
+ ret double %tmp3
+}
+
+; Now check some boundary conditions
+define i64 @pairUpBarelyIn(i64* %a) nounwind ssp {
+; LDUR_CHK: pairUpBarelyIn
+; LDUR_CHK-NOT: ldur
+; LDUR_CHK: ldp [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-256]
+; LDUR_CHK-NEXT: add x{{[0-9]+}}, [[DST2]], [[DST1]]
+; LDUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds i64* %a, i64 -31
+ %tmp1 = load i64* %p1, align 2
+ %p2 = getelementptr inbounds i64* %a, i64 -32
+ %tmp2 = load i64* %p2, align 2
+ %tmp3 = add i64 %tmp1, %tmp2
+ ret i64 %tmp3
+}
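+; The loads above sit at byte offsets -31*8 = -248 and -32*8 = -256, both of
+; which still fit the 9-bit signed ldur offset range, so they can be addressed
+; straight off x0 and paired. pairUpBarelyOut below starts at -33*8 = -264,
+; which no longer fits, so the base register presumably has to be adjusted
+; first and no ldp is expected.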
+
+define i64 @pairUpBarelyOut(i64* %a) nounwind ssp {
+; LDUR_CHK: pairUpBarelyOut
+; LDUR_CHK-NOT: ldp
+; Don't be fragile about which loads or manipulations of the base register
+; are used---just check that there isn't an ldp before the add
+; LDUR_CHK: add
+; LDUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds i64* %a, i64 -32
+ %tmp1 = load i64* %p1, align 2
+ %p2 = getelementptr inbounds i64* %a, i64 -33
+ %tmp2 = load i64* %p2, align 2
+ %tmp3 = add i64 %tmp1, %tmp2
+ ret i64 %tmp3
+}
+
+define i64 @pairUpNotAligned(i64* %a) nounwind ssp {
+; LDUR_CHK: pairUpNotAligned
+; LDUR_CHK-NOT: ldp
+; LDUR_CHK: ldur
+; LDUR_CHK-NEXT: ldur
+; LDUR_CHK-NEXT: add
+; LDUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds i64* %a, i64 -18
+ %bp1 = bitcast i64* %p1 to i8*
+ %bp1p1 = getelementptr inbounds i8* %bp1, i64 1
+ %dp1 = bitcast i8* %bp1p1 to i64*
+ %tmp1 = load i64* %dp1, align 1
+
+ %p2 = getelementptr inbounds i64* %a, i64 -17
+ %bp2 = bitcast i64* %p2 to i8*
+ %bp2p1 = getelementptr inbounds i8* %bp2, i64 1
+ %dp2 = bitcast i8* %bp2p1 to i64*
+ %tmp2 = load i64* %dp2, align 1
+
+ %tmp3 = add i64 %tmp1, %tmp2
+ ret i64 %tmp3
+}
diff --git a/test/CodeGen/AArch64/arm64-ldur.ll b/test/CodeGen/AArch64/arm64-ldur.ll
new file mode 100644
index 0000000..2848c06
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-ldur.ll
@@ -0,0 +1,67 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
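+; Negative offsets cannot be encoded in the scaled, unsigned-immediate ldr
+; form, so the loads below are expected to select the unscaled ldur variants.
+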
+define i64 @_f0(i64* %p) {
+; CHECK: f0:
+; CHECK: ldur x0, [x0, #-8]
+; CHECK-NEXT: ret
+ %tmp = getelementptr inbounds i64* %p, i64 -1
+ %ret = load i64* %tmp, align 2
+ ret i64 %ret
+}
+define i32 @_f1(i32* %p) {
+; CHECK: f1:
+; CHECK: ldur w0, [x0, #-4]
+; CHECK-NEXT: ret
+ %tmp = getelementptr inbounds i32* %p, i64 -1
+ %ret = load i32* %tmp, align 2
+ ret i32 %ret
+}
+define i16 @_f2(i16* %p) {
+; CHECK: f2:
+; CHECK: ldurh w0, [x0, #-2]
+; CHECK-NEXT: ret
+ %tmp = getelementptr inbounds i16* %p, i64 -1
+ %ret = load i16* %tmp, align 2
+ ret i16 %ret
+}
+define i8 @_f3(i8* %p) {
+; CHECK: f3:
+; CHECK: ldurb w0, [x0, #-1]
+; CHECK-NEXT: ret
+ %tmp = getelementptr inbounds i8* %p, i64 -1
+ %ret = load i8* %tmp, align 2
+ ret i8 %ret
+}
+
+define i64 @zext32(i8* %a) nounwind ssp {
+; CHECK-LABEL: zext32:
+; CHECK: ldur w0, [x0, #-12]
+; CHECK-NEXT: ret
+ %p = getelementptr inbounds i8* %a, i64 -12
+ %tmp1 = bitcast i8* %p to i32*
+ %tmp2 = load i32* %tmp1, align 4
+ %ret = zext i32 %tmp2 to i64
+
+ ret i64 %ret
+}
+define i64 @zext16(i8* %a) nounwind ssp {
+; CHECK-LABEL: zext16:
+; CHECK: ldurh w0, [x0, #-12]
+; CHECK-NEXT: ret
+ %p = getelementptr inbounds i8* %a, i64 -12
+ %tmp1 = bitcast i8* %p to i16*
+ %tmp2 = load i16* %tmp1, align 2
+ %ret = zext i16 %tmp2 to i64
+
+ ret i64 %ret
+}
+define i64 @zext8(i8* %a) nounwind ssp {
+; CHECK-LABEL: zext8:
+; CHECK: ldurb w0, [x0, #-12]
+; CHECK-NEXT: ret
+ %p = getelementptr inbounds i8* %a, i64 -12
+ %tmp2 = load i8* %p, align 1
+ %ret = zext i8 %tmp2 to i64
+
+ ret i64 %ret
+}
diff --git a/test/CodeGen/AArch64/arm64-ldxr-stxr.ll b/test/CodeGen/AArch64/arm64-ldxr-stxr.ll
new file mode 100644
index 0000000..9093df2
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-ldxr-stxr.ll
@@ -0,0 +1,270 @@
+; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s
+
+%0 = type { i64, i64 }
+
+define i128 @f0(i8* %p) nounwind readonly {
+; CHECK-LABEL: f0:
+; CHECK: ldxp {{x[0-9]+}}, {{x[0-9]+}}, [x0]
+entry:
+ %ldrexd = tail call %0 @llvm.aarch64.ldxp(i8* %p)
+ %0 = extractvalue %0 %ldrexd, 1
+ %1 = extractvalue %0 %ldrexd, 0
+ %2 = zext i64 %0 to i128
+ %3 = zext i64 %1 to i128
+ %shl = shl nuw i128 %2, 64
+ %4 = or i128 %shl, %3
+ ret i128 %4
+}
+
+define i32 @f1(i8* %ptr, i128 %val) nounwind {
+; CHECK-LABEL: f1:
+; CHECK: stxp {{w[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, [x0]
+entry:
+ %tmp4 = trunc i128 %val to i64
+ %tmp6 = lshr i128 %val, 64
+ %tmp7 = trunc i128 %tmp6 to i64
+ %strexd = tail call i32 @llvm.aarch64.stxp(i64 %tmp4, i64 %tmp7, i8* %ptr)
+ ret i32 %strexd
+}
+
+declare %0 @llvm.aarch64.ldxp(i8*) nounwind
+declare i32 @llvm.aarch64.stxp(i64, i64, i8*) nounwind
+
+@var = global i64 0, align 8
+
+define void @test_load_i8(i8* %addr) {
+; CHECK-LABEL: test_load_i8:
+; CHECK: ldxrb w[[LOADVAL:[0-9]+]], [x0]
+; CHECK-NOT: uxtb
+; CHECK-NOT: and
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+ %val = call i64 @llvm.aarch64.ldxr.p0i8(i8* %addr)
+ %shortval = trunc i64 %val to i8
+ %extval = zext i8 %shortval to i64
+ store i64 %extval, i64* @var, align 8
+ ret void
+}
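+; ldxrb already zero-extends the loaded byte into its w register, so the
+; trunc/zext pair in the IR should fold away; the CHECK-NOT lines above guard
+; against a redundant uxtb or and. The i16 and i32 variants below follow the
+; same pattern.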
+
+define void @test_load_i16(i16* %addr) {
+; CHECK-LABEL: test_load_i16:
+; CHECK: ldxrh w[[LOADVAL:[0-9]+]], [x0]
+; CHECK-NOT: uxth
+; CHECK-NOT: and
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+ %val = call i64 @llvm.aarch64.ldxr.p0i16(i16* %addr)
+ %shortval = trunc i64 %val to i16
+ %extval = zext i16 %shortval to i64
+ store i64 %extval, i64* @var, align 8
+ ret void
+}
+
+define void @test_load_i32(i32* %addr) {
+; CHECK-LABEL: test_load_i32:
+; CHECK: ldxr w[[LOADVAL:[0-9]+]], [x0]
+; CHECK-NOT: uxtw
+; CHECK-NOT: and
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+ %val = call i64 @llvm.aarch64.ldxr.p0i32(i32* %addr)
+ %shortval = trunc i64 %val to i32
+ %extval = zext i32 %shortval to i64
+ store i64 %extval, i64* @var, align 8
+ ret void
+}
+
+define void @test_load_i64(i64* %addr) {
+; CHECK-LABEL: test_load_i64:
+; CHECK: ldxr x[[LOADVAL:[0-9]+]], [x0]
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+ %val = call i64 @llvm.aarch64.ldxr.p0i64(i64* %addr)
+ store i64 %val, i64* @var, align 8
+ ret void
+}
+
+
+declare i64 @llvm.aarch64.ldxr.p0i8(i8*) nounwind
+declare i64 @llvm.aarch64.ldxr.p0i16(i16*) nounwind
+declare i64 @llvm.aarch64.ldxr.p0i32(i32*) nounwind
+declare i64 @llvm.aarch64.ldxr.p0i64(i64*) nounwind
+
+define i32 @test_store_i8(i32, i8 %val, i8* %addr) {
+; CHECK-LABEL: test_store_i8:
+; CHECK-NOT: uxtb
+; CHECK-NOT: and
+; CHECK: stxrb w0, w1, [x2]
+ %extval = zext i8 %val to i64
+ %res = call i32 @llvm.aarch64.stxr.p0i8(i64 %extval, i8* %addr)
+ ret i32 %res
+}
+
+define i32 @test_store_i16(i32, i16 %val, i16* %addr) {
+; CHECK-LABEL: test_store_i16:
+; CHECK-NOT: uxth
+; CHECK-NOT: and
+; CHECK: stxrh w0, w1, [x2]
+ %extval = zext i16 %val to i64
+ %res = call i32 @llvm.aarch64.stxr.p0i16(i64 %extval, i16* %addr)
+ ret i32 %res
+}
+
+define i32 @test_store_i32(i32, i32 %val, i32* %addr) {
+; CHECK-LABEL: test_store_i32:
+; CHECK-NOT: uxtw
+; CHECK-NOT: and
+; CHECK: stxr w0, w1, [x2]
+ %extval = zext i32 %val to i64
+ %res = call i32 @llvm.aarch64.stxr.p0i32(i64 %extval, i32* %addr)
+ ret i32 %res
+}
+
+define i32 @test_store_i64(i32, i64 %val, i64* %addr) {
+; CHECK-LABEL: test_store_i64:
+; CHECK: stxr w0, x1, [x2]
+ %res = call i32 @llvm.aarch64.stxr.p0i64(i64 %val, i64* %addr)
+ ret i32 %res
+}
+
+declare i32 @llvm.aarch64.stxr.p0i8(i64, i8*) nounwind
+declare i32 @llvm.aarch64.stxr.p0i16(i64, i16*) nounwind
+declare i32 @llvm.aarch64.stxr.p0i32(i64, i32*) nounwind
+declare i32 @llvm.aarch64.stxr.p0i64(i64, i64*) nounwind
+
+; CHECK: test_clear:
+; CHECK: clrex
+define void @test_clear() {
+ call void @llvm.aarch64.clrex()
+ ret void
+}
+
+declare void @llvm.aarch64.clrex() nounwind
+
+define i128 @test_load_acquire_i128(i8* %p) nounwind readonly {
+; CHECK-LABEL: test_load_acquire_i128:
+; CHECK: ldaxp {{x[0-9]+}}, {{x[0-9]+}}, [x0]
+entry:
+ %ldrexd = tail call %0 @llvm.aarch64.ldaxp(i8* %p)
+ %0 = extractvalue %0 %ldrexd, 1
+ %1 = extractvalue %0 %ldrexd, 0
+ %2 = zext i64 %0 to i128
+ %3 = zext i64 %1 to i128
+ %shl = shl nuw i128 %2, 64
+ %4 = or i128 %shl, %3
+ ret i128 %4
+}
+
+define i32 @test_store_release_i128(i8* %ptr, i128 %val) nounwind {
+; CHECK-LABEL: test_store_release_i128:
+; CHECK: stlxp {{w[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, [x0]
+entry:
+ %tmp4 = trunc i128 %val to i64
+ %tmp6 = lshr i128 %val, 64
+ %tmp7 = trunc i128 %tmp6 to i64
+ %strexd = tail call i32 @llvm.aarch64.stlxp(i64 %tmp4, i64 %tmp7, i8* %ptr)
+ ret i32 %strexd
+}
+
+declare %0 @llvm.aarch64.ldaxp(i8*) nounwind
+declare i32 @llvm.aarch64.stlxp(i64, i64, i8*) nounwind
+
+define void @test_load_acquire_i8(i8* %addr) {
+; CHECK-LABEL: test_load_acquire_i8:
+; CHECK: ldaxrb w[[LOADVAL:[0-9]+]], [x0]
+; CHECK-NOT: uxtb
+; CHECK-NOT: and
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+ %val = call i64 @llvm.aarch64.ldaxr.p0i8(i8* %addr)
+ %shortval = trunc i64 %val to i8
+ %extval = zext i8 %shortval to i64
+ store i64 %extval, i64* @var, align 8
+ ret void
+}
+
+define void @test_load_acquire_i16(i16* %addr) {
+; CHECK-LABEL: test_load_acquire_i16:
+; CHECK: ldaxrh w[[LOADVAL:[0-9]+]], [x0]
+; CHECK-NOT: uxth
+; CHECK-NOT: and
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+ %val = call i64 @llvm.aarch64.ldaxr.p0i16(i16* %addr)
+ %shortval = trunc i64 %val to i16
+ %extval = zext i16 %shortval to i64
+ store i64 %extval, i64* @var, align 8
+ ret void
+}
+
+define void @test_load_acquire_i32(i32* %addr) {
+; CHECK-LABEL: test_load_acquire_i32:
+; CHECK: ldaxr w[[LOADVAL:[0-9]+]], [x0]
+; CHECK-NOT: uxtw
+; CHECK-NOT: and
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+ %val = call i64 @llvm.aarch64.ldaxr.p0i32(i32* %addr)
+ %shortval = trunc i64 %val to i32
+ %extval = zext i32 %shortval to i64
+ store i64 %extval, i64* @var, align 8
+ ret void
+}
+
+define void @test_load_acquire_i64(i64* %addr) {
+; CHECK-LABEL: test_load_acquire_i64:
+; CHECK: ldaxr x[[LOADVAL:[0-9]+]], [x0]
+; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
+
+ %val = call i64 @llvm.aarch64.ldaxr.p0i64(i64* %addr)
+ store i64 %val, i64* @var, align 8
+ ret void
+}
+
+
+declare i64 @llvm.aarch64.ldaxr.p0i8(i8*) nounwind
+declare i64 @llvm.aarch64.ldaxr.p0i16(i16*) nounwind
+declare i64 @llvm.aarch64.ldaxr.p0i32(i32*) nounwind
+declare i64 @llvm.aarch64.ldaxr.p0i64(i64*) nounwind
+
+define i32 @test_store_release_i8(i32, i8 %val, i8* %addr) {
+; CHECK-LABEL: test_store_release_i8:
+; CHECK-NOT: uxtb
+; CHECK-NOT: and
+; CHECK: stlxrb w0, w1, [x2]
+ %extval = zext i8 %val to i64
+ %res = call i32 @llvm.aarch64.stlxr.p0i8(i64 %extval, i8* %addr)
+ ret i32 %res
+}
+
+define i32 @test_store_release_i16(i32, i16 %val, i16* %addr) {
+; CHECK-LABEL: test_store_release_i16:
+; CHECK-NOT: uxth
+; CHECK-NOT: and
+; CHECK: stlxrh w0, w1, [x2]
+ %extval = zext i16 %val to i64
+ %res = call i32 @llvm.aarch64.stlxr.p0i16(i64 %extval, i16* %addr)
+ ret i32 %res
+}
+
+define i32 @test_store_release_i32(i32, i32 %val, i32* %addr) {
+; CHECK-LABEL: test_store_release_i32:
+; CHECK-NOT: uxtw
+; CHECK-NOT: and
+; CHECK: stlxr w0, w1, [x2]
+ %extval = zext i32 %val to i64
+ %res = call i32 @llvm.aarch64.stlxr.p0i32(i64 %extval, i32* %addr)
+ ret i32 %res
+}
+
+define i32 @test_store_release_i64(i32, i64 %val, i64* %addr) {
+; CHECK-LABEL: test_store_release_i64:
+; CHECK: stlxr w0, x1, [x2]
+ %res = call i32 @llvm.aarch64.stlxr.p0i64(i64 %val, i64* %addr)
+ ret i32 %res
+}
+
+declare i32 @llvm.aarch64.stlxr.p0i8(i64, i8*) nounwind
+declare i32 @llvm.aarch64.stlxr.p0i16(i64, i16*) nounwind
+declare i32 @llvm.aarch64.stlxr.p0i32(i64, i32*) nounwind
+declare i32 @llvm.aarch64.stlxr.p0i64(i64, i64*) nounwind
diff --git a/test/CodeGen/AArch64/arm64-leaf.ll b/test/CodeGen/AArch64/arm64-leaf.ll
new file mode 100644
index 0000000..d3b2031
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-leaf.ll
@@ -0,0 +1,13 @@
+; RUN: llc -march=arm64 -mtriple=arm64-apple-ios < %s | FileCheck %s
+; rdar://12829704
+
+define void @t8() nounwind ssp {
+; CHECK-LABEL: t8:
+; CHECK-NOT: stp fp, lr, [sp, #-16]!
+; CHECK-NOT: mov fp, sp
+; CHECK: nop
+; CHECK-NOT: mov sp, fp
+; CHECK-NOT: ldp fp, lr, [sp], #16
+ tail call void asm sideeffect "nop", "~{v8}"() nounwind
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-long-shift.ll b/test/CodeGen/AArch64/arm64-long-shift.ll
new file mode 100644
index 0000000..d5baf16
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-long-shift.ll
@@ -0,0 +1,59 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone | FileCheck %s
+
+define i128 @shl(i128 %r, i128 %s) nounwind readnone {
+; CHECK-LABEL: shl:
+; CHECK: lsl [[XREG_0:x[0-9]+]], x1, x2
+; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40
+; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2
+; CHECK-NEXT: lsr [[XREG_3:x[0-9]+]], x0, [[XREG_2]]
+; CHECK-NEXT: orr [[XREG_6:x[0-9]+]], [[XREG_3]], [[XREG_0]]
+; CHECK-NEXT: sub [[XREG_4:x[0-9]+]], x2, #64
+; CHECK-NEXT: lsl [[XREG_5:x[0-9]+]], x0, [[XREG_4]]
+; CHECK-NEXT: cmp [[XREG_4]], #0
+; CHECK-NEXT: csel x1, [[XREG_5]], [[XREG_6]], ge
+; CHECK-NEXT: lsl [[SMALLSHIFT_LO:x[0-9]+]], x0, x2
+; CHECK-NEXT: csel x0, xzr, [[SMALLSHIFT_LO]], ge
+; CHECK-NEXT: ret
+
+ %shl = shl i128 %r, %s
+ ret i128 %shl
+}
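+; The i128 shift is lowered as two 64-bit halves: one candidate result assumes
+; a shift amount below 64 (lsl of the high half combined with lsr of the low
+; half), the other assumes 64 or more (lsl of the low half by s-64), and csel
+; picks between them based on the sign of s-64. ashr and lshr below use the
+; mirrored construction.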
+
+define i128 @ashr(i128 %r, i128 %s) nounwind readnone {
+; CHECK-LABEL: ashr:
+; CHECK: lsr [[XREG_0:x[0-9]+]], x0, x2
+; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40
+; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2
+; CHECK-NEXT: lsl [[XREG_3:x[0-9]+]], x1, [[XREG_2]]
+; CHECK-NEXT: orr [[XREG_4:x[0-9]+]], [[XREG_0]], [[XREG_3]]
+; CHECK-NEXT: sub [[XREG_5:x[0-9]+]], x2, #64
+; CHECK-NEXT: asr [[XREG_6:x[0-9]+]], x1, [[XREG_5]]
+; CHECK-NEXT: cmp [[XREG_5]], #0
+; CHECK-NEXT: csel x0, [[XREG_6]], [[XREG_4]], ge
+; CHECK-NEXT: asr [[SMALLSHIFT_HI:x[0-9]+]], x1, x2
+; CHECK-NEXT: asr [[BIGSHIFT_HI:x[0-9]+]], x1, #63
+; CHECK-NEXT: csel x1, [[BIGSHIFT_HI]], [[SMALLSHIFT_HI]], ge
+; CHECK-NEXT: ret
+
+ %shr = ashr i128 %r, %s
+ ret i128 %shr
+}
+
+define i128 @lshr(i128 %r, i128 %s) nounwind readnone {
+; CHECK-LABEL: lshr:
+; CHECK: lsr [[XREG_0:x[0-9]+]], x0, x2
+; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40
+; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2
+; CHECK-NEXT: lsl [[XREG_3:x[0-9]+]], x1, [[XREG_2]]
+; CHECK-NEXT: orr [[XREG_4:x[0-9]+]], [[XREG_0]], [[XREG_3]]
+; CHECK-NEXT: sub [[XREG_5:x[0-9]+]], x2, #64
+; CHECK-NEXT: lsr [[XREG_6:x[0-9]+]], x1, [[XREG_5]]
+; CHECK-NEXT: cmp [[XREG_5]], #0
+; CHECK-NEXT: csel x0, [[XREG_6]], [[XREG_4]], ge
+; CHECK-NEXT: lsr [[SMALLSHIFT_HI:x[0-9]+]], x1, x2
+; CHECK-NEXT: csel x1, xzr, [[SMALLSHIFT_HI]], ge
+; CHECK-NEXT: ret
+
+ %shr = lshr i128 %r, %s
+ ret i128 %shr
+}
diff --git a/test/CodeGen/AArch64/arm64-memcpy-inline.ll b/test/CodeGen/AArch64/arm64-memcpy-inline.ll
new file mode 100644
index 0000000..f921a59
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-memcpy-inline.ll
@@ -0,0 +1,112 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone | FileCheck %s
+
+%struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 }
+
+@src = external global %struct.x
+@dst = external global %struct.x
+
+@.str1 = private unnamed_addr constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 1
+@.str2 = private unnamed_addr constant [36 x i8] c"DHRYSTONE PROGRAM, SOME STRING BLAH\00", align 1
+@.str3 = private unnamed_addr constant [24 x i8] c"DHRYSTONE PROGRAM, SOME\00", align 1
+@.str4 = private unnamed_addr constant [18 x i8] c"DHRYSTONE PROGR \00", align 1
+@.str5 = private unnamed_addr constant [7 x i8] c"DHRYST\00", align 1
+@.str6 = private unnamed_addr constant [14 x i8] c"/tmp/rmXXXXXX\00", align 1
+@spool.splbuf = internal global [512 x i8] zeroinitializer, align 16
+
+define i32 @t0() {
+entry:
+; CHECK-LABEL: t0:
+; CHECK: ldrb [[REG0:w[0-9]+]], [x[[BASEREG:[0-9]+]], #10]
+; CHECK: strb [[REG0]], [x[[BASEREG2:[0-9]+]], #10]
+; CHECK: ldrh [[REG1:w[0-9]+]], [x[[BASEREG]], #8]
+; CHECK: strh [[REG1]], [x[[BASEREG2]], #8]
+; CHECK: ldr [[REG2:x[0-9]+]],
+; CHECK: str [[REG2]],
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds (%struct.x* @dst, i32 0, i32 0), i8* getelementptr inbounds (%struct.x* @src, i32 0, i32 0), i32 11, i32 8, i1 false)
+ ret i32 0
+}
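+; An 11-byte inline memcpy is expected to decompose into a 1-byte, a 2-byte
+; and an 8-byte load/store pair, as the CHECK lines above spell out, rather
+; than falling back to a libcall.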
+
+define void @t1(i8* nocapture %C) nounwind {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: ldur [[DEST:q[0-9]+]], [x[[BASEREG:[0-9]+]], #15]
+; CHECK: stur [[DEST]], [x0, #15]
+; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]]
+; CHECK: str [[DEST]], [x0]
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([31 x i8]* @.str1, i64 0, i64 0), i64 31, i32 1, i1 false)
+ ret void
+}
+
+define void @t2(i8* nocapture %C) nounwind {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: movz [[REG3:w[0-9]+]]
+; CHECK: movk [[REG3]],
+; CHECK: str [[REG3]], [x0, #32]
+; CHECK: ldp [[DEST1:q[0-9]+]], [[DEST2:q[0-9]+]], [x{{[0-9]+}}]
+; CHECK: stp [[DEST1]], [[DEST2]], [x0]
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8]* @.str2, i64 0, i64 0), i64 36, i32 1, i1 false)
+ ret void
+}
+
+define void @t3(i8* nocapture %C) nounwind {
+entry:
+; CHECK-LABEL: t3:
+; CHECK: ldr [[REG4:x[0-9]+]], [x[[BASEREG:[0-9]+]], #16]
+; CHECK: str [[REG4]], [x0, #16]
+; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]]
+; CHECK: str [[DEST]], [x0]
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8]* @.str3, i64 0, i64 0), i64 24, i32 1, i1 false)
+ ret void
+}
+
+define void @t4(i8* nocapture %C) nounwind {
+entry:
+; CHECK-LABEL: t4:
+; CHECK: orr [[REG5:w[0-9]+]], wzr, #0x20
+; CHECK: strh [[REG5]], [x0, #16]
+; CHECK: ldr [[REG6:q[0-9]+]], [x{{[0-9]+}}]
+; CHECK: str [[REG6]], [x0]
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([18 x i8]* @.str4, i64 0, i64 0), i64 18, i32 1, i1 false)
+ ret void
+}
+
+define void @t5(i8* nocapture %C) nounwind {
+entry:
+; CHECK-LABEL: t5:
+; CHECK: strb wzr, [x0, #6]
+; CHECK: movz [[REG7:w[0-9]+]], #0x5453
+; CHECK: strh [[REG7]], [x0, #4]
+; CHECK: movz [[REG8:w[0-9]+]],
+; CHECK: movk [[REG8]],
+; CHECK: str [[REG8]], [x0]
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([7 x i8]* @.str5, i64 0, i64 0), i64 7, i32 1, i1 false)
+ ret void
+}
+
+define void @t6() nounwind {
+entry:
+; CHECK-LABEL: t6:
+; CHECK: ldur [[REG9:x[0-9]+]], [x{{[0-9]+}}, #6]
+; CHECK: stur [[REG9]], [x{{[0-9]+}}, #6]
+; CHECK: ldr
+; CHECK: str
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([512 x i8]* @spool.splbuf, i64 0, i64 0), i8* getelementptr inbounds ([14 x i8]* @.str6, i64 0, i64 0), i64 14, i32 1, i1 false)
+ ret void
+}
+
+%struct.Foo = type { i32, i32, i32, i32 }
+
+define void @t7(%struct.Foo* nocapture %a, %struct.Foo* nocapture %b) nounwind {
+entry:
+; CHECK: t7
+; CHECK: ldr [[REG10:q[0-9]+]], [x1]
+; CHECK: str [[REG10]], [x0]
+ %0 = bitcast %struct.Foo* %a to i8*
+ %1 = bitcast %struct.Foo* %b to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 16, i32 4, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
diff --git a/test/CodeGen/AArch64/arm64-memset-inline.ll b/test/CodeGen/AArch64/arm64-memset-inline.ll
new file mode 100644
index 0000000..2e237f4
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-memset-inline.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define void @t1(i8* nocapture %c) nounwind optsize {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: str wzr, [x0, #8]
+; CHECK: str xzr, [x0]
+ call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false)
+ ret void
+}
+
+define void @t2() nounwind ssp {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: strh wzr, [sp, #32]
+; CHECK: stp xzr, xzr, [sp, #16]
+; CHECK: str xzr, [sp, #8]
+ %buf = alloca [26 x i8], align 1
+ %0 = getelementptr inbounds [26 x i8]* %buf, i32 0, i32 0
+ call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false)
+ call void @something(i8* %0) nounwind
+ ret void
+}
+
+declare void @something(i8*) nounwind
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
diff --git a/test/CodeGen/AArch64/arm64-memset-to-bzero.ll b/test/CodeGen/AArch64/arm64-memset-to-bzero.ll
new file mode 100644
index 0000000..29036ca
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-memset-to-bzero.ll
@@ -0,0 +1,108 @@
+; RUN: llc %s -mtriple=arm64-apple-darwin -o - | \
+; RUN: FileCheck --check-prefix=CHECK-DARWIN --check-prefix=CHECK %s
+; RUN: llc %s -mtriple=arm64-linux-gnu -o - | \
+; RUN: FileCheck --check-prefix=CHECK-LINUX --check-prefix=CHECK %s
+; <rdar://problem/14199482> ARM64: Calls to bzero() replaced with calls to memset()
+
+; CHECK: @fct1
+; For small size (<= 256), we do not change memset to bzero.
+; CHECK: memset
+define void @fct1(i8* nocapture %ptr) {
+entry:
+ tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 256, i32 1, i1 false)
+ ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+
+; CHECK: @fct2
+; When the size is bigger than 256, change into bzero.
+; CHECK-DARWIN: bzero
+; CHECK-LINUX: memset
+define void @fct2(i8* nocapture %ptr) {
+entry:
+ tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 257, i32 1, i1 false)
+ ret void
+}
+
+; CHECK: @fct3
+; For unknown size, change to bzero.
+; CHECK-DARWIN: bzero
+; CHECK-LINUX: memset
+define void @fct3(i8* nocapture %ptr, i32 %unknown) {
+entry:
+ %conv = sext i32 %unknown to i64
+ tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 %conv, i32 1, i1 false)
+ ret void
+}
+
+; CHECK: @fct4
+; Size <= 256, no change.
+; CHECK: memset
+define void @fct4(i8* %ptr) {
+entry:
+ %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+ %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 256, i64 %tmp)
+ ret void
+}
+
+declare i8* @__memset_chk(i8*, i32, i64, i64)
+
+declare i64 @llvm.objectsize.i64(i8*, i1)
+
+; CHECK: @fct5
+; Size > 256, change.
+; CHECK-DARWIN: bzero
+; CHECK-LINUX: memset
+define void @fct5(i8* %ptr) {
+entry:
+ %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+ %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 257, i64 %tmp)
+ ret void
+}
+
+; CHECK: @fct6
+; Size = unknown, change.
+; CHECK-DARWIN: bzero
+; CHECK-LINUX: memset
+define void @fct6(i8* %ptr, i32 %unknown) {
+entry:
+ %conv = sext i32 %unknown to i64
+ %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+ %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 %conv, i64 %tmp)
+ ret void
+}
+
+; The next functions check that memset is not turned into bzero
+; when the value being set is non-zero, regardless of the given size.
+
+; CHECK: @fct7
+; memset with something that is not a zero, no change.
+; CHECK: memset
+define void @fct7(i8* %ptr) {
+entry:
+ %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+ %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 256, i64 %tmp)
+ ret void
+}
+
+; CHECK: @fct8
+; memset with something that is not a zero, no change.
+; CHECK: memset
+define void @fct8(i8* %ptr) {
+entry:
+ %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+ %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 257, i64 %tmp)
+ ret void
+}
+
+; CHECK: @fct9
+; memset with something that is not a zero, no change.
+; CHECK: memset
+define void @fct9(i8* %ptr, i32 %unknown) {
+entry:
+ %conv = sext i32 %unknown to i64
+ %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+ %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 %conv, i64 %tmp)
+ ret void
+}
diff --git a/test/CodeGen/AArch64/misched-basic-A53.ll b/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
index 1555c48..f88bd6a 100644
--- a/test/CodeGen/AArch64/misched-basic-A53.ll
+++ b/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
;
; The Cortex-A53 machine model will cause the MADD instruction to be scheduled
; much higher than the ADD instructions in order to hide latency. When not
@@ -8,10 +8,8 @@
; CHECK: ********** MI Scheduling **********
; CHECK: main
; CHECK: *** Final schedule for BB#2 ***
-; CHECK: SU(13)
-; CHECK: MADDwwww
-; CHECK: SU(4)
-; CHECK: ADDwwi_lsl0_s
+; CHECK: MADDWrrr
+; CHECK: ADDWri
; CHECK: ********** INTERVALS **********
@main.x = private unnamed_addr constant [8 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 4
@main.y = private unnamed_addr constant [8 x i32] [i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2], align 4
@@ -86,9 +84,9 @@ for.end: ; preds = %for.cond
; CHECK: ********** MI Scheduling **********
; CHECK: neon4xfloat:BB#0
; CHECK: *** Final schedule for BB#0 ***
-; CHECK: FDIVvvv_4S
-; CHECK: FADDvvv_4S
-; CHECK: FADDvvv_4S
+; CHECK: FDIVv4f32
+; CHECK: FADDv4f32
+; CHECK: FADDv4f32
; CHECK: ********** INTERVALS **********
define <4 x float> @neon4xfloat(<4 x float> %A, <4 x float> %B) {
%tmp1 = fadd <4 x float> %A, %B;
@@ -110,3 +108,17 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind }
+
+
+; Regression Test for PR19761
+; [ARM64] Cortex-a53 schedule mode can't handle NEON post-increment load
+;
+; Nothing explicit to check other than llc not crashing.
+define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2(i8* %A, i8** %ptr) {
+ %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8* %A)
+ %tmp = getelementptr i8* %A, i32 32
+ store i8* %tmp, i8** %ptr
+ ret { <16 x i8>, <16 x i8> } %ld2
+}
+
+declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8*)
diff --git a/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
new file mode 100644
index 0000000..97bfb5c
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
@@ -0,0 +1,21 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+;
+; For Cortex-A53, shiftable operands that are not actually shifted
+; are not needed for an additional two cycles.
+;
+; CHECK: ********** MI Scheduling **********
+; CHECK: shiftable
+; CHECK: *** Final schedule for BB#0 ***
+; CHECK: ADDXrr %vreg0, %vreg2
+; CHECK: ADDXrs %vreg0, %vreg2, 5
+; CHECK: ********** INTERVALS **********
+define i64 @shiftable(i64 %A, i64 %B) {
+ %tmp0 = sub i64 %B, 20
+ %tmp1 = shl i64 %tmp0, 5;
+ %tmp2 = add i64 %A, %tmp1;
+ %tmp3 = add i64 %A, %tmp0
+ %tmp4 = mul i64 %tmp2, %tmp3
+
+ ret i64 %tmp4
+}
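+; %tmp2 adds the value shifted left by 5 and should select as the ADDXrs with
+; shift amount 5 checked above, while %tmp3 adds the unshifted %tmp0 and
+; should select as the plain ADDXrr.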
diff --git a/test/CodeGen/AArch64/arm64-movi.ll b/test/CodeGen/AArch64/arm64-movi.ll
new file mode 100644
index 0000000..2cd368d
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-movi.ll
@@ -0,0 +1,202 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+;==--------------------------------------------------------------------------==
+; Tests for MOV-immediate implemented with ORR-immediate.
+;==--------------------------------------------------------------------------==
+
+; 64-bit immed with 32-bit pattern size, rotated by 0.
+define i64 @test64_32_rot0() nounwind {
+; CHECK-LABEL: test64_32_rot0:
+; CHECK: orr x0, xzr, #0x700000007
+ ret i64 30064771079
+}
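+; 30064771079 is 0x0000000700000007: the 32-bit pattern 0x00000007 repeated,
+; which is exactly what a single ORR-immediate encoding can express.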
+
+; 64-bit immed with 32-bit pattern size, rotated by 2.
+define i64 @test64_32_rot2() nounwind {
+; CHECK-LABEL: test64_32_rot2:
+; CHECK: orr x0, xzr, #0xc0000003c0000003
+ ret i64 13835058071388291075
+}
+
+; 64-bit immed with 4-bit pattern size, rotated by 3.
+define i64 @test64_4_rot3() nounwind {
+; CHECK-LABEL: test64_4_rot3:
+; CHECK: orr x0, xzr, #0xeeeeeeeeeeeeeeee
+ ret i64 17216961135462248174
+}
+
+; 32-bit immed with 32-bit pattern size, rotated by 16.
+define i32 @test32_32_rot16() nounwind {
+; CHECK-LABEL: test32_32_rot16:
+; CHECK: orr w0, wzr, #0xff0000
+ ret i32 16711680
+}
+
+; 32-bit immed with 2-bit pattern size, rotated by 1.
+define i32 @test32_2_rot1() nounwind {
+; CHECK-LABEL: test32_2_rot1:
+; CHECK: orr w0, wzr, #0xaaaaaaaa
+ ret i32 2863311530
+}
+
+;==--------------------------------------------------------------------------==
+; Tests for MOVZ with MOVK.
+;==--------------------------------------------------------------------------==
+
+define i32 @movz() nounwind {
+; CHECK-LABEL: movz:
+; CHECK: movz w0, #0x5
+ ret i32 5
+}
+
+define i64 @movz_3movk() nounwind {
+; CHECK-LABEL: movz_3movk:
+; CHECK: movz x0, #0x5, lsl #48
+; CHECK-NEXT: movk x0, #0x1234, lsl #32
+; CHECK-NEXT: movk x0, #0xabcd, lsl #16
+; CHECK-NEXT: movk x0, #0x5678
+ ret i64 1427392313513592
+}
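+; 1427392313513592 is 0x0005_1234_abcd_5678, so the constant is materialized
+; with a movz for the topmost non-zero 16-bit chunk followed by one movk per
+; remaining chunk, matching the CHECK sequence above.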
+
+define i64 @movz_movk_skip1() nounwind {
+; CHECK-LABEL: movz_movk_skip1:
+; CHECK: movz x0, #0x5, lsl #32
+; CHECK-NEXT: movk x0, #0x4321, lsl #16
+ ret i64 22601072640
+}
+
+define i64 @movz_skip1_movk() nounwind {
+; CHECK-LABEL: movz_skip1_movk:
+; CHECK: movz x0, #0x8654, lsl #32
+; CHECK-NEXT: movk x0, #0x1234
+ ret i64 147695335379508
+}
+
+;==--------------------------------------------------------------------------==
+; Tests for MOVN with MOVK.
+;==--------------------------------------------------------------------------==
+
+define i64 @movn() nounwind {
+; CHECK-LABEL: movn:
+; CHECK: movn x0, #0x29
+ ret i64 -42
+}
+
+define i64 @movn_skip1_movk() nounwind {
+; CHECK-LABEL: movn_skip1_movk:
+; CHECK: movn x0, #0x29, lsl #32
+; CHECK-NEXT: movk x0, #0x1234
+ ret i64 -176093720012
+}
+
+;==--------------------------------------------------------------------------==
+; Tests for ORR with MOVK.
+;==--------------------------------------------------------------------------==
+; rdar://14987673
+
+define i64 @orr_movk1() nounwind {
+; CHECK-LABEL: orr_movk1:
+; CHECK: orr x0, xzr, #0xffff0000ffff0
+; CHECK: movk x0, #0xdead, lsl #16
+ ret i64 72056498262245120
+}
+
+define i64 @orr_movk2() nounwind {
+; CHECK-LABEL: orr_movk2:
+; CHECK: orr x0, xzr, #0xffff0000ffff0
+; CHECK: movk x0, #0xdead, lsl #48
+ ret i64 -2400982650836746496
+}
+
+define i64 @orr_movk3() nounwind {
+; CHECK-LABEL: orr_movk3:
+; CHECK: orr x0, xzr, #0xffff0000ffff0
+; CHECK: movk x0, #0xdead, lsl #32
+ ret i64 72020953688702720
+}
+
+define i64 @orr_movk4() nounwind {
+; CHECK-LABEL: orr_movk4:
+; CHECK: orr x0, xzr, #0xffff0000ffff0
+; CHECK: movk x0, #0xdead
+ ret i64 72056494543068845
+}
+
+; rdar://14987618
+define i64 @orr_movk5() nounwind {
+; CHECK-LABEL: orr_movk5:
+; CHECK: orr x0, xzr, #0xff00ff00ff00ff00
+; CHECK: movk x0, #0xdead, lsl #16
+ ret i64 -71777214836900096
+}
+
+define i64 @orr_movk6() nounwind {
+; CHECK-LABEL: orr_movk6:
+; CHECK: orr x0, xzr, #0xff00ff00ff00ff00
+; CHECK: movk x0, #0xdead, lsl #16
+; CHECK: movk x0, #0xdead, lsl #48
+ ret i64 -2400982647117578496
+}
+
+define i64 @orr_movk7() nounwind {
+; CHECK-LABEL: orr_movk7:
+; CHECK: orr x0, xzr, #0xff00ff00ff00ff00
+; CHECK: movk x0, #0xdead, lsl #48
+ ret i64 -2400982646575268096
+}
+
+define i64 @orr_movk8() nounwind {
+; CHECK-LABEL: orr_movk8:
+; CHECK: orr x0, xzr, #0xff00ff00ff00ff00
+; CHECK: movk x0, #0xdead
+; CHECK: movk x0, #0xdead, lsl #48
+ ret i64 -2400982646575276371
+}
+
+; rdar://14987715
+define i64 @orr_movk9() nounwind {
+; CHECK-LABEL: orr_movk9:
+; CHECK: orr x0, xzr, #0xffffff000000000
+; CHECK: movk x0, #0xff00
+; CHECK: movk x0, #0xdead, lsl #16
+ ret i64 1152921439623315200
+}
+
+define i64 @orr_movk10() nounwind {
+; CHECK-LABEL: orr_movk10:
+; CHECK: orr x0, xzr, #0xfffffffffffff00
+; CHECK: movk x0, #0xdead, lsl #16
+ ret i64 1152921504047824640
+}
+
+define i64 @orr_movk11() nounwind {
+; CHECK-LABEL: orr_movk11:
+; CHECK: orr x0, xzr, #0xfff00000000000ff
+; CHECK: movk x0, #0xdead, lsl #16
+; CHECK: movk x0, #0xffff, lsl #32
+ ret i64 -4222125209747201
+}
+
+define i64 @orr_movk12() nounwind {
+; CHECK-LABEL: orr_movk12:
+; CHECK: orr x0, xzr, #0xfff00000000000ff
+; CHECK: movk x0, #0xdead, lsl #32
+ ret i64 -4258765016661761
+}
+
+define i64 @orr_movk13() nounwind {
+; CHECK-LABEL: orr_movk13:
+; CHECK: orr x0, xzr, #0xfffff000000
+; CHECK: movk x0, #0xdead
+; CHECK: movk x0, #0xdead, lsl #48
+ ret i64 -2401245434149282131
+}
+
+; rdar://13944082
+define i64 @g() nounwind {
+; CHECK-LABEL: g:
+; CHECK: movz x0, #0xffff, lsl #48
+; CHECK: movk x0, #0x2
+entry:
+ ret i64 -281474976710654
+}
diff --git a/test/CodeGen/AArch64/arm64-mul.ll b/test/CodeGen/AArch64/arm64-mul.ll
new file mode 100644
index 0000000..2e7986d
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-mul.ll
@@ -0,0 +1,90 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+; rdar://9296808
+; rdar://9349137
+
+define i128 @t1(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: mul {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: umulh {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp1 = zext i64 %a to i128
+ %tmp2 = zext i64 %b to i128
+ %tmp3 = mul i128 %tmp1, %tmp2
+ ret i128 %tmp3
+}
+
+define i128 @t2(i64 %a, i64 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: mul {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: smulh {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp1 = sext i64 %a to i128
+ %tmp2 = sext i64 %b to i128
+ %tmp3 = mul i128 %tmp1, %tmp2
+ ret i128 %tmp3
+}
+
+define i64 @t3(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK-LABEL: t3:
+; CHECK: umull {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp1 = zext i32 %a to i64
+ %tmp2 = zext i32 %b to i64
+ %tmp3 = mul i64 %tmp1, %tmp2
+ ret i64 %tmp3
+}
+
+define i64 @t4(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK-LABEL: t4:
+; CHECK: smull {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp1 = sext i32 %a to i64
+ %tmp2 = sext i32 %b to i64
+ %tmp3 = mul i64 %tmp1, %tmp2
+ ret i64 %tmp3
+}
+
+define i64 @t5(i32 %a, i32 %b, i64 %c) nounwind {
+entry:
+; CHECK-LABEL: t5:
+; CHECK: umaddl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}}
+ %tmp1 = zext i32 %a to i64
+ %tmp2 = zext i32 %b to i64
+ %tmp3 = mul i64 %tmp1, %tmp2
+ %tmp4 = add i64 %c, %tmp3
+ ret i64 %tmp4
+}
+
+define i64 @t6(i32 %a, i32 %b, i64 %c) nounwind {
+entry:
+; CHECK-LABEL: t6:
+; CHECK: smsubl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}}
+ %tmp1 = sext i32 %a to i64
+ %tmp2 = sext i32 %b to i64
+ %tmp3 = mul i64 %tmp1, %tmp2
+ %tmp4 = sub i64 %c, %tmp3
+ ret i64 %tmp4
+}
+
+define i64 @t7(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK-LABEL: t7:
+; CHECK: umnegl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp1 = zext i32 %a to i64
+ %tmp2 = zext i32 %b to i64
+ %tmp3 = mul i64 %tmp1, %tmp2
+ %tmp4 = sub i64 0, %tmp3
+ ret i64 %tmp4
+}
+
+define i64 @t8(i32 %a, i32 %b) nounwind {
+entry:
+; CHECK-LABEL: t8:
+; CHECK: smnegl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp1 = sext i32 %a to i64
+ %tmp2 = sext i32 %b to i64
+ %tmp3 = mul i64 %tmp1, %tmp2
+ %tmp4 = sub i64 0, %tmp3
+ ret i64 %tmp4
+}
diff --git a/test/CodeGen/AArch64/arm64-named-reg-alloc.ll b/test/CodeGen/AArch64/arm64-named-reg-alloc.ll
new file mode 100644
index 0000000..d86d2e6
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-named-reg-alloc.ll
@@ -0,0 +1,14 @@
+; RUN: not llc < %s -mtriple=arm64-apple-darwin 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=arm64-linux-gnueabi 2>&1 | FileCheck %s
+
+define i32 @get_stack() nounwind {
+entry:
+; FIXME: Include an allocatable-specific error message
+; CHECK: Invalid register name global variable
+ %sp = call i32 @llvm.read_register.i32(metadata !0)
+ ret i32 %sp
+}
+
+declare i32 @llvm.read_register.i32(metadata) nounwind
+
+!0 = metadata !{metadata !"x5\00"}
diff --git a/test/CodeGen/AArch64/arm64-named-reg-notareg.ll b/test/CodeGen/AArch64/arm64-named-reg-notareg.ll
new file mode 100644
index 0000000..3ca14c4
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-named-reg-notareg.ll
@@ -0,0 +1,13 @@
+; RUN: not llc < %s -mtriple=arm64-apple-darwin 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=arm64-linux-gnueabi 2>&1 | FileCheck %s
+
+define i32 @get_stack() nounwind {
+entry:
+; CHECK: Invalid register name global variable
+ %sp = call i32 @llvm.read_register.i32(metadata !0)
+ ret i32 %sp
+}
+
+declare i32 @llvm.read_register.i32(metadata) nounwind
+
+!0 = metadata !{metadata !"notareg\00"}
diff --git a/test/CodeGen/AArch64/arm64-neg.ll b/test/CodeGen/AArch64/arm64-neg.ll
new file mode 100644
index 0000000..659ce98
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-neg.ll
@@ -0,0 +1,71 @@
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+
+define i32 @test_neg_i32(i32 %in) {
+; CHECK-LABEL: test_neg_i32:
+; CHECK: neg w0, w0
+ %res = sub i32 0, %in
+ ret i32 %res
+}
+
+define i64 @test_neg_i64(i64 %in) {
+; CHECK-LABEL: test_neg_i64:
+; CHECK: neg x0, x0
+ %res = sub i64 0, %in
+ ret i64 %res
+}
+
+define <8 x i8> @test_neg_v8i8(<8 x i8> %in) {
+; CHECK-LABEL: test_neg_v8i8:
+; CHECK: neg v0.8b, v0.8b
+ %res = sub <8 x i8> zeroinitializer, %in
+ ret <8 x i8> %res
+}
+
+define <4 x i16> @test_neg_v4i16(<4 x i16> %in) {
+; CHECK-LABEL: test_neg_v4i16:
+; CHECK: neg v0.4h, v0.4h
+ %res = sub <4 x i16> zeroinitializer, %in
+ ret <4 x i16> %res
+}
+
+define <2 x i32> @test_neg_v2i32(<2 x i32> %in) {
+; CHECK-LABEL: test_neg_v2i32:
+; CHECK: neg v0.2s, v0.2s
+ %res = sub <2 x i32> zeroinitializer, %in
+ ret <2 x i32> %res
+}
+
+define <16 x i8> @test_neg_v16i8(<16 x i8> %in) {
+; CHECK-LABEL: test_neg_v16i8:
+; CHECK: neg v0.16b, v0.16b
+ %res = sub <16 x i8> zeroinitializer, %in
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @test_neg_v8i16(<8 x i16> %in) {
+; CHECK-LABEL: test_neg_v8i16:
+; CHECK: neg v0.8h, v0.8h
+ %res = sub <8 x i16> zeroinitializer, %in
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @test_neg_v4i32(<4 x i32> %in) {
+; CHECK-LABEL: test_neg_v4i32:
+; CHECK: neg v0.4s, v0.4s
+ %res = sub <4 x i32> zeroinitializer, %in
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @test_neg_v2i64(<2 x i64> %in) {
+; CHECK-LABEL: test_neg_v2i64:
+; CHECK: neg v0.2d, v0.2d
+ %res = sub <2 x i64> zeroinitializer, %in
+ ret <2 x i64> %res
+}
+
+define <1 x i64> @test_neg_v1i64(<1 x i64> %in) {
+; CHECK-LABEL: test_neg_v1i64:
+; CHECK: neg d0, d0
+ %res = sub <1 x i64> zeroinitializer, %in
+ ret <1 x i64> %res
+}
diff --git a/test/CodeGen/AArch64/neon-2velem-high.ll b/test/CodeGen/AArch64/arm64-neon-2velem-high.ll
index 97031d9..58df094 100644
--- a/test/CodeGen/AArch64/neon-2velem-high.ll
+++ b/test/CodeGen/AArch64/arm64-neon-2velem-high.ll
@@ -1,259 +1,269 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
-declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
-declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
-declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
-declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
-declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
-declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
-declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
-declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) {
-; CHECK: test_vmull_high_n_s16:
-; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-LABEL: test_vmull_high_n_s16:
+; CHECK: dup [[REPLICATE:v[0-9]+]].8h, w0
+; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
%vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
%vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
%vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
- %vmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %vmull15.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
ret <4 x i32> %vmull15.i.i
}
define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) {
-; CHECK: test_vmull_high_n_s32:
-; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vmull_high_n_s32:
+; CHECK: dup [[REPLICATE:v[0-9]+]].4s, w0
+; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
%vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
- %vmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %vmull9.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
ret <2 x i64> %vmull9.i.i
}
define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) {
-; CHECK: test_vmull_high_n_u16:
-; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-LABEL: test_vmull_high_n_u16:
+; CHECK: dup [[REPLICATE:v[0-9]+]].8h, w0
+; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
%vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
%vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
%vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
- %vmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %vmull15.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
ret <4 x i32> %vmull15.i.i
}
define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) {
-; CHECK: test_vmull_high_n_u32:
-; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vmull_high_n_u32:
+; CHECK: dup [[REPLICATE:v[0-9]+]].4s, w0
+; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
%vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
- %vmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %vmull9.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
ret <2 x i64> %vmull9.i.i
}
define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) {
-; CHECK: test_vqdmull_high_n_s16:
-; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK-LABEL: test_vqdmull_high_n_s16:
+; CHECK: dup [[REPLICATE:v[0-9]+]].8h, w0
+; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
%vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
%vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
%vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
- %vqdmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %vqdmull15.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
ret <4 x i32> %vqdmull15.i.i
}
define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) {
-; CHECK: test_vqdmull_high_n_s32:
-; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vqdmull_high_n_s32:
+; CHECK: dup [[REPLICATE:v[0-9]+]].4s, w0
+; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
%vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
- %vqdmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %vqdmull9.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
ret <2 x i64> %vqdmull9.i.i
}
define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
-; CHECK: test_vmlal_high_n_s16:
-; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+; CHECK-LABEL: test_vmlal_high_n_s16:
+; CHECK: dup [[REPLICATE:v[0-9]+]].8h, w0
+; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
%vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
%vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
%vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
- %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
%add.i.i = add <4 x i32> %vmull2.i.i.i, %a
ret <4 x i32> %add.i.i
}
define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
-; CHECK: test_vmlal_high_n_s32:
-; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+; CHECK-LABEL: test_vmlal_high_n_s32:
+; CHECK: dup [[REPLICATE:v[0-9]+]].4s, w0
+; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
%vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
- %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
%add.i.i = add <2 x i64> %vmull2.i.i.i, %a
ret <2 x i64> %add.i.i
}
define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
-; CHECK: test_vmlal_high_n_u16:
-; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+; CHECK-LABEL: test_vmlal_high_n_u16:
+; CHECK: dup [[REPLICATE:v[0-9]+]].8h, w0
+; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, [[REPLICATE]].8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
%vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
%vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
%vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
- %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
%add.i.i = add <4 x i32> %vmull2.i.i.i, %a
ret <4 x i32> %add.i.i
}
define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
-; CHECK: test_vmlal_high_n_u32:
-; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+; CHECK-LABEL: test_vmlal_high_n_u32:
+; CHECK: dup [[REPLICATE:v[0-9]+]].4s, w0
+; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, [[REPLICATE]].4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
%vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
- %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
%add.i.i = add <2 x i64> %vmull2.i.i.i, %a
ret <2 x i64> %add.i.i
}
define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
-; CHECK: test_vqdmlal_high_n_s16:
-; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+; CHECK-LABEL: test_vqdmlal_high_n_s16:
+; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
%vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
%vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
%vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
- %vqdmlal15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
- %vqdmlal17.i.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i)
+ %vqdmlal15.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %vqdmlal17.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i)
ret <4 x i32> %vqdmlal17.i.i
}
define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
-; CHECK: test_vqdmlal_high_n_s32:
-; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+; CHECK-LABEL: test_vqdmlal_high_n_s32:
+; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
%vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
- %vqdmlal9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
- %vqdmlal11.i.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i)
+ %vqdmlal9.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %vqdmlal11.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i)
ret <2 x i64> %vqdmlal11.i.i
}
define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
-; CHECK: test_vmlsl_high_n_s16:
-; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+; CHECK-LABEL: test_vmlsl_high_n_s16:
+; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
%vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
%vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
%vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
- %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
%sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
ret <4 x i32> %sub.i.i
}
define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
-; CHECK: test_vmlsl_high_n_s32:
-; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+; CHECK-LABEL: test_vmlsl_high_n_s32:
+; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
%vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
- %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
%sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
ret <2 x i64> %sub.i.i
}
define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
-; CHECK: test_vmlsl_high_n_u16:
-; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+; CHECK-LABEL: test_vmlsl_high_n_u16:
+; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
%vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
%vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
%vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
- %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
%sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
ret <4 x i32> %sub.i.i
}
define <2 x i64> @test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
-; CHECK: test_vmlsl_high_n_u32:
-; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+; CHECK-LABEL: test_vmlsl_high_n_u32:
+; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
%vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
- %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
%sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
ret <2 x i64> %sub.i.i
}
define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
-; CHECK: test_vqdmlsl_high_n_s16:
-; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+; CHECK-LABEL: test_vqdmlsl_high_n_s16:
+; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
%vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
%vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
%vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
- %vqdmlsl15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
- %vqdmlsl17.i.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i)
+ %vqdmlsl15.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %vqdmlsl17.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i)
ret <4 x i32> %vqdmlsl17.i.i
}
define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
-; CHECK: test_vqdmlsl_high_n_s32:
-; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+; CHECK-LABEL: test_vqdmlsl_high_n_s32:
+; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
%vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
- %vqdmlsl9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
- %vqdmlsl11.i.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i)
+ %vqdmlsl9.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %vqdmlsl11.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i)
ret <2 x i64> %vqdmlsl11.i.i
}
define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) {
-; CHECK: test_vmul_n_f32:
+; CHECK-LABEL: test_vmul_n_f32:
; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
entry:
%vecinit.i = insertelement <2 x float> undef, float %b, i32 0
@@ -263,7 +273,7 @@ entry:
}
define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) {
-; CHECK: test_vmulq_n_f32:
+; CHECK-LABEL: test_vmulq_n_f32:
; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
entry:
%vecinit.i = insertelement <4 x float> undef, float %b, i32 0
@@ -275,7 +285,7 @@ entry:
}
define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) {
-; CHECK: test_vmulq_n_f64:
+; CHECK-LABEL: test_vmulq_n_f64:
; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
entry:
%vecinit.i = insertelement <2 x double> undef, double %b, i32 0
@@ -285,7 +295,7 @@ entry:
}
define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) {
-; CHECK: test_vfma_n_f32:
+; CHECK-LABEL: test_vfma_n_f32:
; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}]
entry:
%vecinit.i = insertelement <2 x float> undef, float %n, i32 0
@@ -295,7 +305,7 @@ entry:
}
define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) {
-; CHECK: test_vfmaq_n_f32:
+; CHECK-LABEL: test_vfmaq_n_f32:
; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
entry:
%vecinit.i = insertelement <4 x float> undef, float %n, i32 0
@@ -307,7 +317,7 @@ entry:
}
define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) {
-; CHECK: test_vfms_n_f32:
+; CHECK-LABEL: test_vfms_n_f32:
; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}]
entry:
%vecinit.i = insertelement <2 x float> undef, float %n, i32 0
@@ -318,7 +328,7 @@ entry:
}
define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) {
-; CHECK: test_vfmsq_n_f32:
+; CHECK-LABEL: test_vfmsq_n_f32:
; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
entry:
%vecinit.i = insertelement <4 x float> undef, float %n, i32 0
diff --git a/test/CodeGen/AArch64/neon-2velem.ll b/test/CodeGen/AArch64/arm64-neon-2velem.ll
index acffb14..869966c 100644
--- a/test/CodeGen/AArch64/neon-2velem.ll
+++ b/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -1,49 +1,49 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
-declare <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double>, <2 x double>)
+declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>)
-declare <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>)
-declare <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float>, <2 x float>)
+declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
-declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
-declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
-declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
-declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)
-declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>)
-declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>)
-declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>)
-declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
-declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
-declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
-declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
-declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
-declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
-declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmla_lane_s16:
+; CHECK-LABEL: test_vmla_lane_s16:
; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
@@ -54,7 +54,7 @@ entry:
}
define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlaq_lane_s16:
+; CHECK-LABEL: test_vmlaq_lane_s16:
; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
@@ -65,7 +65,7 @@ entry:
}
define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmla_lane_s32:
+; CHECK-LABEL: test_vmla_lane_s32:
; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -76,7 +76,7 @@ entry:
}
define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlaq_lane_s32:
+; CHECK-LABEL: test_vmlaq_lane_s32:
; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -87,7 +87,7 @@ entry:
}
define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmla_laneq_s16:
+; CHECK-LABEL: test_vmla_laneq_s16:
; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
@@ -98,7 +98,7 @@ entry:
}
define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlaq_laneq_s16:
+; CHECK-LABEL: test_vmlaq_laneq_s16:
; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
@@ -109,7 +109,7 @@ entry:
}
define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmla_laneq_s32:
+; CHECK-LABEL: test_vmla_laneq_s32:
; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -120,7 +120,7 @@ entry:
}
define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlaq_laneq_s32:
+; CHECK-LABEL: test_vmlaq_laneq_s32:
; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -131,7 +131,7 @@ entry:
}
define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmls_lane_s16:
+; CHECK-LABEL: test_vmls_lane_s16:
; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
@@ -142,7 +142,7 @@ entry:
}
define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlsq_lane_s16:
+; CHECK-LABEL: test_vmlsq_lane_s16:
; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
@@ -153,7 +153,7 @@ entry:
}
define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmls_lane_s32:
+; CHECK-LABEL: test_vmls_lane_s32:
; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -164,7 +164,7 @@ entry:
}
define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlsq_lane_s32:
+; CHECK-LABEL: test_vmlsq_lane_s32:
; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -175,7 +175,7 @@ entry:
}
define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmls_laneq_s16:
+; CHECK-LABEL: test_vmls_laneq_s16:
; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
@@ -186,7 +186,7 @@ entry:
}
define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlsq_laneq_s16:
+; CHECK-LABEL: test_vmlsq_laneq_s16:
; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
@@ -197,7 +197,7 @@ entry:
}
define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmls_laneq_s32:
+; CHECK-LABEL: test_vmls_laneq_s32:
; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -208,7 +208,7 @@ entry:
}
define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlsq_laneq_s32:
+; CHECK-LABEL: test_vmlsq_laneq_s32:
; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -219,7 +219,7 @@ entry:
}
define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmul_lane_s16:
+; CHECK-LABEL: test_vmul_lane_s16:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
@@ -229,7 +229,7 @@ entry:
}
define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmulq_lane_s16:
+; CHECK-LABEL: test_vmulq_lane_s16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
@@ -239,7 +239,7 @@ entry:
}
define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmul_lane_s32:
+; CHECK-LABEL: test_vmul_lane_s32:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -249,7 +249,7 @@ entry:
}
define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmulq_lane_s32:
+; CHECK-LABEL: test_vmulq_lane_s32:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -259,7 +259,7 @@ entry:
}
define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmul_lane_u16:
+; CHECK-LABEL: test_vmul_lane_u16:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
@@ -269,7 +269,7 @@ entry:
}
define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmulq_lane_u16:
+; CHECK-LABEL: test_vmulq_lane_u16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
@@ -279,7 +279,7 @@ entry:
}
define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmul_lane_u32:
+; CHECK-LABEL: test_vmul_lane_u32:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -289,7 +289,7 @@ entry:
}
define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmulq_lane_u32:
+; CHECK-LABEL: test_vmulq_lane_u32:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -299,7 +299,7 @@ entry:
}
define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmul_laneq_s16:
+; CHECK-LABEL: test_vmul_laneq_s16:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
@@ -309,7 +309,7 @@ entry:
}
define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmulq_laneq_s16:
+; CHECK-LABEL: test_vmulq_laneq_s16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
@@ -319,7 +319,7 @@ entry:
}
define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmul_laneq_s32:
+; CHECK-LABEL: test_vmul_laneq_s32:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -329,7 +329,7 @@ entry:
}
define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmulq_laneq_s32:
+; CHECK-LABEL: test_vmulq_laneq_s32:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -339,7 +339,7 @@ entry:
}
define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmul_laneq_u16:
+; CHECK-LABEL: test_vmul_laneq_u16:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
@@ -349,7 +349,7 @@ entry:
}
define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmulq_laneq_u16:
+; CHECK-LABEL: test_vmulq_laneq_u16:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
@@ -359,7 +359,7 @@ entry:
}
define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmul_laneq_u32:
+; CHECK-LABEL: test_vmul_laneq_u32:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -369,7 +369,7 @@ entry:
}
define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmulq_laneq_u32:
+; CHECK-LABEL: test_vmulq_laneq_u32:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -379,7 +379,7 @@ entry:
}
define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
-; CHECK: test_vfma_lane_f32:
+; CHECK-LABEL: test_vfma_lane_f32:
; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -391,7 +391,7 @@ entry:
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
-; CHECK: test_vfmaq_lane_f32:
+; CHECK-LABEL: test_vfmaq_lane_f32:
; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -403,7 +403,7 @@ entry:
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
-; CHECK: test_vfma_laneq_f32:
+; CHECK-LABEL: test_vfma_laneq_f32:
; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -413,7 +413,7 @@ entry:
}
define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
-; CHECK: test_vfmaq_laneq_f32:
+; CHECK-LABEL: test_vfmaq_laneq_f32:
; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -423,7 +423,7 @@ entry:
}
define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
-; CHECK: test_vfms_lane_f32:
+; CHECK-LABEL: test_vfms_lane_f32:
; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -434,7 +434,7 @@ entry:
}
define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
-; CHECK: test_vfmsq_lane_f32:
+; CHECK-LABEL: test_vfmsq_lane_f32:
; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -445,7 +445,7 @@ entry:
}
define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
-; CHECK: test_vfms_laneq_f32:
+; CHECK-LABEL: test_vfms_laneq_f32:
; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -456,7 +456,7 @@ entry:
}
define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
-; CHECK: test_vfmsq_laneq_f32:
+; CHECK-LABEL: test_vfmsq_laneq_f32:
; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -467,7 +467,7 @@ entry:
}
define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
-; CHECK: test_vfmaq_lane_f64:
+; CHECK-LABEL: test_vfmaq_lane_f64:
; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
@@ -479,7 +479,7 @@ entry:
declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
-; CHECK: test_vfmaq_laneq_f64:
+; CHECK-LABEL: test_vfmaq_laneq_f64:
; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
; CHECK-NEXT: ret
entry:
@@ -489,7 +489,7 @@ entry:
}
define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
-; CHECK: test_vfmsq_lane_f64:
+; CHECK-LABEL: test_vfmsq_lane_f64:
; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
@@ -500,7 +500,7 @@ entry:
}
define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
-; CHECK: test_vfmsq_laneq_f64:
+; CHECK-LABEL: test_vfmsq_laneq_f64:
; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
; CHECK-NEXT: ret
entry:
@@ -524,7 +524,7 @@ declare float @llvm.fma.f32(float, float, float)
define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) {
; CHECK-LABEL: test_vfmsd_lane_f64
-; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+; CHECK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
; CHECK-NEXT: ret
entry:
%extract.rhs = extractelement <1 x double> %v, i32 0
@@ -536,7 +536,7 @@ entry:
declare double @llvm.fma.f64(double, double, double)
define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) {
-; CHECK: test_vfmss_laneq_f32
+; CHECK-LABEL: test_vfmss_laneq_f32:
; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -558,799 +558,799 @@ entry:
}
define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlal_lane_s16:
+; CHECK-LABEL: test_vmlal_lane_s16:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlal_lane_s32:
+; CHECK-LABEL: test_vmlal_lane_s32:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlal_laneq_s16:
+; CHECK-LABEL: test_vmlal_laneq_s16:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlal_laneq_s32:
+; CHECK-LABEL: test_vmlal_laneq_s32:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlal_high_lane_s16:
+; CHECK-LABEL: test_vmlal_high_lane_s16:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlal_high_lane_s32:
+; CHECK-LABEL: test_vmlal_high_lane_s32:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlal_high_laneq_s16:
+; CHECK-LABEL: test_vmlal_high_laneq_s16:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlal_high_laneq_s32:
+; CHECK-LABEL: test_vmlal_high_laneq_s32:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlsl_lane_s16:
+; CHECK-LABEL: test_vmlsl_lane_s16:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlsl_lane_s32:
+; CHECK-LABEL: test_vmlsl_lane_s32:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlsl_laneq_s16:
+; CHECK-LABEL: test_vmlsl_laneq_s16:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlsl_laneq_s32:
+; CHECK-LABEL: test_vmlsl_laneq_s32:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlsl_high_lane_s16:
+; CHECK-LABEL: test_vmlsl_high_lane_s16:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlsl_high_lane_s32:
+; CHECK-LABEL: test_vmlsl_high_lane_s32:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlsl_high_laneq_s16:
+; CHECK-LABEL: test_vmlsl_high_laneq_s16:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlsl_high_laneq_s32:
+; CHECK-LABEL: test_vmlsl_high_laneq_s32:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlal_lane_u16:
+; CHECK-LABEL: test_vmlal_lane_u16:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlal_lane_u32:
+; CHECK-LABEL: test_vmlal_lane_u32:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlal_laneq_u16:
+; CHECK-LABEL: test_vmlal_laneq_u16:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlal_laneq_u32:
+; CHECK-LABEL: test_vmlal_laneq_u32:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlal_high_lane_u16:
+; CHECK-LABEL: test_vmlal_high_lane_u16:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlal_high_lane_u32:
+; CHECK-LABEL: test_vmlal_high_lane_u32:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlal_high_laneq_u16:
+; CHECK-LABEL: test_vmlal_high_laneq_u16:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlal_high_laneq_u32:
+; CHECK-LABEL: test_vmlal_high_laneq_u32:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlsl_lane_u16:
+; CHECK-LABEL: test_vmlsl_lane_u16:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlsl_lane_u32:
+; CHECK-LABEL: test_vmlsl_lane_u32:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlsl_laneq_u16:
+; CHECK-LABEL: test_vmlsl_laneq_u16:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlsl_laneq_u32:
+; CHECK-LABEL: test_vmlsl_laneq_u32:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlsl_high_lane_u16:
+; CHECK-LABEL: test_vmlsl_high_lane_u16:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlsl_high_lane_u32:
+; CHECK-LABEL: test_vmlsl_high_lane_u32:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlsl_high_laneq_u16:
+; CHECK-LABEL: test_vmlsl_high_laneq_u16:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlsl_high_laneq_u32:
+; CHECK-LABEL: test_vmlsl_high_laneq_u32:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmull_lane_s16:
+; CHECK-LABEL: test_vmull_lane_s16:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmull_lane_s32:
+; CHECK-LABEL: test_vmull_lane_s32:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmull_lane_u16:
+; CHECK-LABEL: test_vmull_lane_u16:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmull_lane_u32:
+; CHECK-LABEL: test_vmull_lane_u32:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmull_high_lane_s16:
+; CHECK-LABEL: test_vmull_high_lane_s16:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmull_high_lane_s32:
+; CHECK-LABEL: test_vmull_high_lane_s32:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmull_high_lane_u16:
+; CHECK-LABEL: test_vmull_high_lane_u16:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmull_high_lane_u32:
+; CHECK-LABEL: test_vmull_high_lane_u32:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmull_laneq_s16:
+; CHECK-LABEL: test_vmull_laneq_s16:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmull_laneq_s32:
+; CHECK-LABEL: test_vmull_laneq_s32:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmull_laneq_u16:
+; CHECK-LABEL: test_vmull_laneq_u16:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmull_laneq_u32:
+; CHECK-LABEL: test_vmull_laneq_u32:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmull_high_laneq_s16:
+; CHECK-LABEL: test_vmull_high_laneq_s16:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmull_high_laneq_s32:
+; CHECK-LABEL: test_vmull_high_laneq_s32:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmull_high_laneq_u16:
+; CHECK-LABEL: test_vmull_high_laneq_u16:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmull_high_laneq_u32:
+; CHECK-LABEL: test_vmull_high_laneq_u32:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vqdmlal_lane_s16:
+; CHECK-LABEL: test_vqdmlal_lane_s16:
; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
- %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+ %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
ret <4 x i32> %vqdmlal4.i
}
define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vqdmlal_lane_s32:
+; CHECK-LABEL: test_vqdmlal_lane_s32:
; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
- %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+ %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
ret <2 x i64> %vqdmlal4.i
}
define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vqdmlal_high_lane_s16:
+; CHECK-LABEL: test_vqdmlal_high_lane_s16:
; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
- %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+ %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
ret <4 x i32> %vqdmlal4.i
}
define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vqdmlal_high_lane_s32:
+; CHECK-LABEL: test_vqdmlal_high_lane_s32:
; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
- %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+ %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
ret <2 x i64> %vqdmlal4.i
}
define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vqdmlsl_lane_s16:
+; CHECK-LABEL: test_vqdmlsl_lane_s16:
; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
- %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+ %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
ret <4 x i32> %vqdmlsl4.i
}
define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vqdmlsl_lane_s32:
+; CHECK-LABEL: test_vqdmlsl_lane_s32:
; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
- %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+ %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
ret <2 x i64> %vqdmlsl4.i
}
define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vqdmlsl_high_lane_s16:
+; CHECK-LABEL: test_vqdmlsl_high_lane_s16:
; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
- %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+ %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
ret <4 x i32> %vqdmlsl4.i
}
define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vqdmlsl_high_lane_s32:
+; CHECK-LABEL: test_vqdmlsl_high_lane_s32:
; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
- %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+ %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
ret <2 x i64> %vqdmlsl4.i
}
define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqdmull_lane_s16:
+; CHECK-LABEL: test_vqdmull_lane_s16:
; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vqdmull2.i
}
define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqdmull_lane_s32:
+; CHECK-LABEL: test_vqdmull_lane_s32:
; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vqdmull2.i
}
define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vqdmull_laneq_s16:
+; CHECK-LABEL: test_vqdmull_laneq_s16:
; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vqdmull2.i
}
define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vqdmull_laneq_s32:
+; CHECK-LABEL: test_vqdmull_laneq_s32:
; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vqdmull2.i
}
define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqdmull_high_lane_s16:
+; CHECK-LABEL: test_vqdmull_high_lane_s16:
; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vqdmull2.i
}
define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqdmull_high_lane_s32:
+; CHECK-LABEL: test_vqdmull_high_lane_s32:
; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vqdmull2.i
}
define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vqdmull_high_laneq_s16:
+; CHECK-LABEL: test_vqdmull_high_laneq_s16:
; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
- %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vqdmull2.i
}
define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vqdmull_high_laneq_s32:
+; CHECK-LABEL: test_vqdmull_high_laneq_s32:
; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
- %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vqdmull2.i
}
define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqdmulh_lane_s16:
+; CHECK-LABEL: test_vqdmulh_lane_s16:
; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vqdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
+ %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i16> %vqdmulh2.i
}
define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqdmulhq_lane_s16:
+; CHECK-LABEL: test_vqdmulhq_lane_s16:
; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
- %vqdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
+ %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
ret <8 x i16> %vqdmulh2.i
}
define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqdmulh_lane_s32:
+; CHECK-LABEL: test_vqdmulh_lane_s32:
; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vqdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
+ %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i32> %vqdmulh2.i
}
define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqdmulhq_lane_s32:
+; CHECK-LABEL: test_vqdmulhq_lane_s32:
; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %vqdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
+ %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
ret <4 x i32> %vqdmulh2.i
}
define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqrdmulh_lane_s16:
+; CHECK-LABEL: test_vqrdmulh_lane_s16:
; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vqrdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
+ %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i16> %vqrdmulh2.i
}
define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqrdmulhq_lane_s16:
+; CHECK-LABEL: test_vqrdmulhq_lane_s16:
; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
- %vqrdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
+ %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
ret <8 x i16> %vqrdmulh2.i
}
define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqrdmulh_lane_s32:
+; CHECK-LABEL: test_vqrdmulh_lane_s32:
; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %vqrdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
+ %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i32> %vqrdmulh2.i
}
define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqrdmulhq_lane_s32:
+; CHECK-LABEL: test_vqrdmulhq_lane_s32:
; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %vqrdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
+ %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
ret <4 x i32> %vqrdmulh2.i
}
define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) {
-; CHECK: test_vmul_lane_f32:
+; CHECK-LABEL: test_vmul_lane_f32:
; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -1360,8 +1360,8 @@ entry:
}
define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) {
-; CHECK: test_vmul_lane_f64:
-; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vmul_lane_f64:
+; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
; CHECK-NEXT: ret
entry:
%0 = bitcast <1 x double> %a to <8 x i8>
@@ -1373,7 +1373,7 @@ entry:
}
define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) {
-; CHECK: test_vmulq_lane_f32:
+; CHECK-LABEL: test_vmulq_lane_f32:
; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
@@ -1383,7 +1383,7 @@ entry:
}
define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) {
-; CHECK: test_vmulq_lane_f64:
+; CHECK-LABEL: test_vmulq_lane_f64:
; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
@@ -1393,7 +1393,7 @@ entry:
}
define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) {
-; CHECK: test_vmul_laneq_f32:
+; CHECK-LABEL: test_vmul_laneq_f32:
; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -1403,7 +1403,7 @@ entry:
}
define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) {
-; CHECK: test_vmul_laneq_f64:
+; CHECK-LABEL: test_vmul_laneq_f64:
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
; CHECK-NEXT: ret
entry:
@@ -1416,7 +1416,7 @@ entry:
}
define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) {
-; CHECK: test_vmulq_laneq_f32:
+; CHECK-LABEL: test_vmulq_laneq_f32:
; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
@@ -1426,7 +1426,7 @@ entry:
}
define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) {
-; CHECK: test_vmulq_laneq_f64:
+; CHECK-LABEL: test_vmulq_laneq_f64:
; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
; CHECK-NEXT: ret
entry:
@@ -1436,67 +1436,67 @@ entry:
}
define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) {
-; CHECK: test_vmulx_lane_f32:
+; CHECK-LABEL: test_vmulx_lane_f32:
; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
- %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
+ %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
ret <2 x float> %vmulx2.i
}
define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) {
-; CHECK: test_vmulxq_lane_f32:
+; CHECK-LABEL: test_vmulxq_lane_f32:
; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
+ %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
ret <4 x float> %vmulx2.i
}
define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) {
-; CHECK: test_vmulxq_lane_f64:
+; CHECK-LABEL: test_vmulxq_lane_f64:
; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
- %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
+ %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
ret <2 x double> %vmulx2.i
}
define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) {
-; CHECK: test_vmulx_laneq_f32:
+; CHECK-LABEL: test_vmulx_laneq_f32:
; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
- %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
+ %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
ret <2 x float> %vmulx2.i
}
define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) {
-; CHECK: test_vmulxq_laneq_f32:
+; CHECK-LABEL: test_vmulxq_laneq_f32:
; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
- %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
+ %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
ret <4 x float> %vmulx2.i
}
define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) {
-; CHECK: test_vmulxq_laneq_f64:
+; CHECK-LABEL: test_vmulxq_laneq_f64:
; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
- %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
+ %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
ret <2 x double> %vmulx2.i
}
define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmla_lane_s16_0:
+; CHECK-LABEL: test_vmla_lane_s16_0:
; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1507,7 +1507,7 @@ entry:
}
define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlaq_lane_s16_0:
+; CHECK-LABEL: test_vmlaq_lane_s16_0:
; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1518,7 +1518,7 @@ entry:
}
define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmla_lane_s32_0:
+; CHECK-LABEL: test_vmla_lane_s32_0:
; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1529,7 +1529,7 @@ entry:
}
define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlaq_lane_s32_0:
+; CHECK-LABEL: test_vmlaq_lane_s32_0:
; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1540,7 +1540,7 @@ entry:
}
define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmla_laneq_s16_0:
+; CHECK-LABEL: test_vmla_laneq_s16_0:
; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1551,7 +1551,7 @@ entry:
}
define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlaq_laneq_s16_0:
+; CHECK-LABEL: test_vmlaq_laneq_s16_0:
; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1562,7 +1562,7 @@ entry:
}
define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmla_laneq_s32_0:
+; CHECK-LABEL: test_vmla_laneq_s32_0:
; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1573,7 +1573,7 @@ entry:
}
define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlaq_laneq_s32_0:
+; CHECK-LABEL: test_vmlaq_laneq_s32_0:
; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1584,7 +1584,7 @@ entry:
}
define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmls_lane_s16_0:
+; CHECK-LABEL: test_vmls_lane_s16_0:
; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1595,7 +1595,7 @@ entry:
}
define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlsq_lane_s16_0:
+; CHECK-LABEL: test_vmlsq_lane_s16_0:
; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1606,7 +1606,7 @@ entry:
}
define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmls_lane_s32_0:
+; CHECK-LABEL: test_vmls_lane_s32_0:
; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1617,7 +1617,7 @@ entry:
}
define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlsq_lane_s32_0:
+; CHECK-LABEL: test_vmlsq_lane_s32_0:
; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1628,7 +1628,7 @@ entry:
}
define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmls_laneq_s16_0:
+; CHECK-LABEL: test_vmls_laneq_s16_0:
; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1639,7 +1639,7 @@ entry:
}
define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlsq_laneq_s16_0:
+; CHECK-LABEL: test_vmlsq_laneq_s16_0:
; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1650,7 +1650,7 @@ entry:
}
define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmls_laneq_s32_0:
+; CHECK-LABEL: test_vmls_laneq_s32_0:
; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1661,7 +1661,7 @@ entry:
}
define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlsq_laneq_s32_0:
+; CHECK-LABEL: test_vmlsq_laneq_s32_0:
; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1672,7 +1672,7 @@ entry:
}
define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmul_lane_s16_0:
+; CHECK-LABEL: test_vmul_lane_s16_0:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1682,7 +1682,7 @@ entry:
}
define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmulq_lane_s16_0:
+; CHECK-LABEL: test_vmulq_lane_s16_0:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1692,7 +1692,7 @@ entry:
}
define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmul_lane_s32_0:
+; CHECK-LABEL: test_vmul_lane_s32_0:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1702,7 +1702,7 @@ entry:
}
define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmulq_lane_s32_0:
+; CHECK-LABEL: test_vmulq_lane_s32_0:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1712,7 +1712,7 @@ entry:
}
define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmul_lane_u16_0:
+; CHECK-LABEL: test_vmul_lane_u16_0:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1722,7 +1722,7 @@ entry:
}
define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmulq_lane_u16_0:
+; CHECK-LABEL: test_vmulq_lane_u16_0:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1732,7 +1732,7 @@ entry:
}
define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmul_lane_u32_0:
+; CHECK-LABEL: test_vmul_lane_u32_0:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1742,7 +1742,7 @@ entry:
}
define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmulq_lane_u32_0:
+; CHECK-LABEL: test_vmulq_lane_u32_0:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1752,7 +1752,7 @@ entry:
}
define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmul_laneq_s16_0:
+; CHECK-LABEL: test_vmul_laneq_s16_0:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1762,7 +1762,7 @@ entry:
}
define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmulq_laneq_s16_0:
+; CHECK-LABEL: test_vmulq_laneq_s16_0:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1772,7 +1772,7 @@ entry:
}
define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmul_laneq_s32_0:
+; CHECK-LABEL: test_vmul_laneq_s32_0:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1782,7 +1782,7 @@ entry:
}
define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmulq_laneq_s32_0:
+; CHECK-LABEL: test_vmulq_laneq_s32_0:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1792,7 +1792,7 @@ entry:
}
define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmul_laneq_u16_0:
+; CHECK-LABEL: test_vmul_laneq_u16_0:
; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1802,7 +1802,7 @@ entry:
}
define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmulq_laneq_u16_0:
+; CHECK-LABEL: test_vmulq_laneq_u16_0:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
@@ -1812,7 +1812,7 @@ entry:
}
define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmul_laneq_u32_0:
+; CHECK-LABEL: test_vmul_laneq_u32_0:
; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1822,7 +1822,7 @@ entry:
}
define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmulq_laneq_u32_0:
+; CHECK-LABEL: test_vmulq_laneq_u32_0:
; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1832,7 +1832,7 @@ entry:
}
define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
-; CHECK: test_vfma_lane_f32_0:
+; CHECK-LABEL: test_vfma_lane_f32_0:
; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1842,7 +1842,7 @@ entry:
}
define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
-; CHECK: test_vfmaq_lane_f32_0:
+; CHECK-LABEL: test_vfmaq_lane_f32_0:
; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1852,7 +1852,7 @@ entry:
}
define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
-; CHECK: test_vfma_laneq_f32_0:
+; CHECK-LABEL: test_vfma_laneq_f32_0:
; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1862,7 +1862,7 @@ entry:
}
define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
-; CHECK: test_vfmaq_laneq_f32_0:
+; CHECK-LABEL: test_vfmaq_laneq_f32_0:
; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1872,7 +1872,7 @@ entry:
}
define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
-; CHECK: test_vfms_lane_f32_0:
+; CHECK-LABEL: test_vfms_lane_f32_0:
; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1883,7 +1883,7 @@ entry:
}
define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
-; CHECK: test_vfmsq_lane_f32_0:
+; CHECK-LABEL: test_vfmsq_lane_f32_0:
; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1894,7 +1894,7 @@ entry:
}
define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
-; CHECK: test_vfms_laneq_f32_0:
+; CHECK-LABEL: test_vfms_laneq_f32_0:
; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1905,7 +1905,7 @@ entry:
}
define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
-; CHECK: test_vfmsq_laneq_f32_0:
+; CHECK-LABEL: test_vfmsq_laneq_f32_0:
; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -1916,7 +1916,7 @@ entry:
}
define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
-; CHECK: test_vfmaq_laneq_f64_0:
+; CHECK-LABEL: test_vfmaq_laneq_f64_0:
; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
@@ -1926,7 +1926,7 @@ entry:
}
define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
-; CHECK: test_vfmsq_laneq_f64_0:
+; CHECK-LABEL: test_vfmsq_laneq_f64_0:
; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
@@ -1937,799 +1937,799 @@ entry:
}
define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlal_lane_s16_0:
+; CHECK-LABEL: test_vmlal_lane_s16_0:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlal_lane_s32_0:
+; CHECK-LABEL: test_vmlal_lane_s32_0:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlal_laneq_s16_0:
+; CHECK-LABEL: test_vmlal_laneq_s16_0:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlal_laneq_s32_0:
+; CHECK-LABEL: test_vmlal_laneq_s32_0:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlal_high_lane_s16_0:
+; CHECK-LABEL: test_vmlal_high_lane_s16_0:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlal_high_lane_s32_0:
+; CHECK-LABEL: test_vmlal_high_lane_s32_0:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlal_high_laneq_s16_0:
+; CHECK-LABEL: test_vmlal_high_laneq_s16_0:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlal_high_laneq_s32_0:
+; CHECK-LABEL: test_vmlal_high_laneq_s32_0:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlsl_lane_s16_0:
+; CHECK-LABEL: test_vmlsl_lane_s16_0:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlsl_lane_s32_0:
+; CHECK-LABEL: test_vmlsl_lane_s32_0:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlsl_laneq_s16_0:
+; CHECK-LABEL: test_vmlsl_laneq_s16_0:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlsl_laneq_s32_0:
+; CHECK-LABEL: test_vmlsl_laneq_s32_0:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlsl_high_lane_s16_0:
+; CHECK-LABEL: test_vmlsl_high_lane_s16_0:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlsl_high_lane_s32_0:
+; CHECK-LABEL: test_vmlsl_high_lane_s32_0:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlsl_high_laneq_s16_0:
+; CHECK-LABEL: test_vmlsl_high_laneq_s16_0:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlsl_high_laneq_s32_0:
+; CHECK-LABEL: test_vmlsl_high_laneq_s32_0:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlal_lane_u16_0:
+; CHECK-LABEL: test_vmlal_lane_u16_0:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlal_lane_u32_0:
+; CHECK-LABEL: test_vmlal_lane_u32_0:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlal_laneq_u16_0:
+; CHECK-LABEL: test_vmlal_laneq_u16_0:
; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlal_laneq_u32_0:
+; CHECK-LABEL: test_vmlal_laneq_u32_0:
; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlal_high_lane_u16_0:
+; CHECK-LABEL: test_vmlal_high_lane_u16_0:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlal_high_lane_u32_0:
+; CHECK-LABEL: test_vmlal_high_lane_u32_0:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlal_high_laneq_u16_0:
+; CHECK-LABEL: test_vmlal_high_laneq_u16_0:
; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%add = add <4 x i32> %vmull2.i, %a
ret <4 x i32> %add
}
define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlal_high_laneq_u32_0:
+; CHECK-LABEL: test_vmlal_high_laneq_u32_0:
; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%add = add <2 x i64> %vmull2.i, %a
ret <2 x i64> %add
}
define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlsl_lane_u16_0:
+; CHECK-LABEL: test_vmlsl_lane_u16_0:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlsl_lane_u32_0:
+; CHECK-LABEL: test_vmlsl_lane_u32_0:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlsl_laneq_u16_0:
+; CHECK-LABEL: test_vmlsl_laneq_u16_0:
; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlsl_laneq_u32_0:
+; CHECK-LABEL: test_vmlsl_laneq_u32_0:
; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vmlsl_high_lane_u16_0:
+; CHECK-LABEL: test_vmlsl_high_lane_u16_0:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vmlsl_high_lane_u32_0:
+; CHECK-LABEL: test_vmlsl_high_lane_u32_0:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
-; CHECK: test_vmlsl_high_laneq_u16_0:
+; CHECK-LABEL: test_vmlsl_high_laneq_u16_0:
; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
%sub = sub <4 x i32> %a, %vmull2.i
ret <4 x i32> %sub
}
define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
-; CHECK: test_vmlsl_high_laneq_u32_0:
+; CHECK-LABEL: test_vmlsl_high_laneq_u32_0:
; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
%sub = sub <2 x i64> %a, %vmull2.i
ret <2 x i64> %sub
}
define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmull_lane_s16_0:
+; CHECK-LABEL: test_vmull_lane_s16_0:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmull_lane_s32_0:
+; CHECK-LABEL: test_vmull_lane_s32_0:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmull_lane_u16_0:
+; CHECK-LABEL: test_vmull_lane_u16_0:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmull_lane_u32_0:
+; CHECK-LABEL: test_vmull_lane_u32_0:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmull_high_lane_s16_0:
+; CHECK-LABEL: test_vmull_high_lane_s16_0:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmull_high_lane_s32_0:
+; CHECK-LABEL: test_vmull_high_lane_s32_0:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vmull_high_lane_u16_0:
+; CHECK-LABEL: test_vmull_high_lane_u16_0:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vmull_high_lane_u32_0:
+; CHECK-LABEL: test_vmull_high_lane_u32_0:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmull_laneq_s16_0:
+; CHECK-LABEL: test_vmull_laneq_s16_0:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmull_laneq_s32_0:
+; CHECK-LABEL: test_vmull_laneq_s32_0:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmull_laneq_u16_0:
+; CHECK-LABEL: test_vmull_laneq_u16_0:
; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmull_laneq_u32_0:
+; CHECK-LABEL: test_vmull_laneq_u32_0:
; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmull_high_laneq_s16_0:
+; CHECK-LABEL: test_vmull_high_laneq_s16_0:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmull_high_laneq_s32_0:
+; CHECK-LABEL: test_vmull_high_laneq_s32_0:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vmull_high_laneq_u16_0:
+; CHECK-LABEL: test_vmull_high_laneq_u16_0:
; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vmull_high_laneq_u32_0:
+; CHECK-LABEL: test_vmull_high_laneq_u32_0:
; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vmull2.i
}
define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vqdmlal_lane_s16_0:
+; CHECK-LABEL: test_vqdmlal_lane_s16_0:
; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
- %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+ %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
ret <4 x i32> %vqdmlal4.i
}
define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vqdmlal_lane_s32_0:
+; CHECK-LABEL: test_vqdmlal_lane_s32_0:
; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
- %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+ %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
ret <2 x i64> %vqdmlal4.i
}
define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vqdmlal_high_lane_s16_0:
+; CHECK-LABEL: test_vqdmlal_high_lane_s16_0:
; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
- %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+ %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
ret <4 x i32> %vqdmlal4.i
}
define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vqdmlal_high_lane_s32_0:
+; CHECK-LABEL: test_vqdmlal_high_lane_s32_0:
; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
- %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+ %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
ret <2 x i64> %vqdmlal4.i
}
define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vqdmlsl_lane_s16_0:
+; CHECK-LABEL: test_vqdmlsl_lane_s16_0:
; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
- %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+ %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
ret <4 x i32> %vqdmlsl4.i
}
define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vqdmlsl_lane_s32_0:
+; CHECK-LABEL: test_vqdmlsl_lane_s32_0:
; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
- %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+ %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
ret <2 x i64> %vqdmlsl4.i
}
define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
-; CHECK: test_vqdmlsl_high_lane_s16_0:
+; CHECK-LABEL: test_vqdmlsl_high_lane_s16_0:
; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
- %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+ %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
ret <4 x i32> %vqdmlsl4.i
}
define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
-; CHECK: test_vqdmlsl_high_lane_s32_0:
+; CHECK-LABEL: test_vqdmlsl_high_lane_s32_0:
; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
- %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+ %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
ret <2 x i64> %vqdmlsl4.i
}
define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqdmull_lane_s16_0:
+; CHECK-LABEL: test_vqdmull_lane_s16_0:
; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vqdmull2.i
}
define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqdmull_lane_s32_0:
+; CHECK-LABEL: test_vqdmull_lane_s32_0:
; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vqdmull2.i
}
define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vqdmull_laneq_s16_0:
+; CHECK-LABEL: test_vqdmull_laneq_s16_0:
; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i32> %vqdmull2.i
}
define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vqdmull_laneq_s32_0:
+; CHECK-LABEL: test_vqdmull_laneq_s32_0:
; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i64> %vqdmull2.i
}
define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqdmull_high_lane_s16_0:
+; CHECK-LABEL: test_vqdmull_high_lane_s16_0:
; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vqdmull2.i
}
define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqdmull_high_lane_s32_0:
+; CHECK-LABEL: test_vqdmull_high_lane_s32_0:
; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vqdmull2.i
}
define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
-; CHECK: test_vqdmull_high_laneq_s16_0:
+; CHECK-LABEL: test_vqdmull_high_laneq_s16_0:
; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
- %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
ret <4 x i32> %vqdmull2.i
}
define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
-; CHECK: test_vqdmull_high_laneq_s32_0:
+; CHECK-LABEL: test_vqdmull_high_laneq_s32_0:
; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
- %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
ret <2 x i64> %vqdmull2.i
}
define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqdmulh_lane_s16_0:
+; CHECK-LABEL: test_vqdmulh_lane_s16_0:
; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vqdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
+ %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i16> %vqdmulh2.i
}
define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqdmulhq_lane_s16_0:
+; CHECK-LABEL: test_vqdmulhq_lane_s16_0:
; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
- %vqdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
+ %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
ret <8 x i16> %vqdmulh2.i
}
define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqdmulh_lane_s32_0:
+; CHECK-LABEL: test_vqdmulh_lane_s32_0:
; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vqdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
+ %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i32> %vqdmulh2.i
}
define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqdmulhq_lane_s32_0:
+; CHECK-LABEL: test_vqdmulhq_lane_s32_0:
; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
- %vqdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
+ %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
ret <4 x i32> %vqdmulh2.i
}
define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqrdmulh_lane_s16_0:
+; CHECK-LABEL: test_vqrdmulh_lane_s16_0:
; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
- %vqrdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
+ %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
ret <4 x i16> %vqrdmulh2.i
}
define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
-; CHECK: test_vqrdmulhq_lane_s16_0:
+; CHECK-LABEL: test_vqrdmulhq_lane_s16_0:
; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
- %vqrdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
+ %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
ret <8 x i16> %vqrdmulh2.i
}
define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqrdmulh_lane_s32_0:
+; CHECK-LABEL: test_vqrdmulh_lane_s32_0:
; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
- %vqrdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
+ %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
ret <2 x i32> %vqrdmulh2.i
}
define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
-; CHECK: test_vqrdmulhq_lane_s32_0:
+; CHECK-LABEL: test_vqrdmulhq_lane_s32_0:
; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
- %vqrdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
+ %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
ret <4 x i32> %vqrdmulh2.i
}
define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) {
-; CHECK: test_vmul_lane_f32_0:
+; CHECK-LABEL: test_vmul_lane_f32_0:
; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -2739,7 +2739,7 @@ entry:
}
define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
-; CHECK: test_vmulq_lane_f32_0:
+; CHECK-LABEL: test_vmulq_lane_f32_0:
; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -2749,7 +2749,7 @@ entry:
}
define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
-; CHECK: test_vmul_laneq_f32_0:
+; CHECK-LABEL: test_vmul_laneq_f32_0:
; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -2759,7 +2759,7 @@ entry:
}
define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
-; CHECK: test_vmul_laneq_f64_0:
+; CHECK-LABEL: test_vmul_laneq_f64_0:
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
@@ -2772,7 +2772,7 @@ entry:
}
define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
-; CHECK: test_vmulq_laneq_f32_0:
+; CHECK-LABEL: test_vmulq_laneq_f32_0:
; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
@@ -2782,7 +2782,7 @@ entry:
}
define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
-; CHECK: test_vmulq_laneq_f64_0:
+; CHECK-LABEL: test_vmulq_laneq_f64_0:
; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
@@ -2792,62 +2792,62 @@ entry:
}
define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) {
-; CHECK: test_vmulx_lane_f32_0:
+; CHECK-LABEL: test_vmulx_lane_f32_0:
; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
- %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
+ %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
ret <2 x float> %vmulx2.i
}
define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
-; CHECK: test_vmulxq_lane_f32_0:
+; CHECK-LABEL: test_vmulxq_lane_f32_0:
; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
- %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
+ %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
ret <4 x float> %vmulx2.i
}
define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) {
-; CHECK: test_vmulxq_lane_f64_0:
+; CHECK-LABEL: test_vmulxq_lane_f64_0:
; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
- %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
+ %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
ret <2 x double> %vmulx2.i
}
define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
-; CHECK: test_vmulx_laneq_f32_0:
+; CHECK-LABEL: test_vmulx_laneq_f32_0:
; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
- %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
+ %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
ret <2 x float> %vmulx2.i
}
define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
-; CHECK: test_vmulxq_laneq_f32_0:
+; CHECK-LABEL: test_vmulxq_laneq_f32_0:
; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
- %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
+ %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
ret <4 x float> %vmulx2.i
}
define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
-; CHECK: test_vmulxq_laneq_f64_0:
+; CHECK-LABEL: test_vmulxq_laneq_f64_0:
; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
- %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
+ %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
ret <2 x double> %vmulx2.i
}
diff --git a/test/CodeGen/AArch64/neon-3vdiff.ll b/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
index 96400eb..cb9b36c 100644
--- a/test/CodeGen/AArch64/neon-3vdiff.ll
+++ b/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
@@ -1,57 +1,57 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
-declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>)
+declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>)
-declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
-declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
-declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
-declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
-declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
-declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
-declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
-declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>)
+declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
-declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
-declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>)
+declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>)
-declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>)
-declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>)
-declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>)
-declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>)
-declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>)
-declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>)
-declare <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64>, <2 x i64>)
+declare <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64>, <2 x i64>)
-declare <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32>, <4 x i32>)
+declare <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32>, <4 x i32>)
-declare <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16>, <8 x i16>)
+declare <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>)
-declare <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64>, <2 x i64>)
+declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>)
-declare <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32>, <4 x i32>)
+declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>)
-declare <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16>, <8 x i16>)
+declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>)
define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vaddl_s8:
+; CHECK-LABEL: test_vaddl_s8:
; CHECK: saddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%vmovl.i.i = sext <8 x i8> %a to <8 x i16>
@@ -61,7 +61,7 @@ entry:
}
define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vaddl_s16:
+; CHECK-LABEL: test_vaddl_s16:
; CHECK: saddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%vmovl.i.i = sext <4 x i16> %a to <4 x i32>
@@ -71,7 +71,7 @@ entry:
}
define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vaddl_s32:
+; CHECK-LABEL: test_vaddl_s32:
; CHECK: saddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vmovl.i.i = sext <2 x i32> %a to <2 x i64>
@@ -81,7 +81,7 @@ entry:
}
define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vaddl_u8:
+; CHECK-LABEL: test_vaddl_u8:
; CHECK: uaddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%vmovl.i.i = zext <8 x i8> %a to <8 x i16>
@@ -91,7 +91,7 @@ entry:
}
define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vaddl_u16:
+; CHECK-LABEL: test_vaddl_u16:
; CHECK: uaddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%vmovl.i.i = zext <4 x i16> %a to <4 x i32>
@@ -101,7 +101,7 @@ entry:
}
define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vaddl_u32:
+; CHECK-LABEL: test_vaddl_u32:
; CHECK: uaddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vmovl.i.i = zext <2 x i32> %a to <2 x i64>
@@ -111,7 +111,7 @@ entry:
}
define <8 x i16> @test_vaddl_high_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vaddl_high_s8:
+; CHECK-LABEL: test_vaddl_high_s8:
; CHECK: saddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -123,7 +123,7 @@ entry:
}
define <4 x i32> @test_vaddl_high_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vaddl_high_s16:
+; CHECK-LABEL: test_vaddl_high_s16:
; CHECK: saddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -135,7 +135,7 @@ entry:
}
define <2 x i64> @test_vaddl_high_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vaddl_high_s32:
+; CHECK-LABEL: test_vaddl_high_s32:
; CHECK: saddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -147,7 +147,7 @@ entry:
}
define <8 x i16> @test_vaddl_high_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vaddl_high_u8:
+; CHECK-LABEL: test_vaddl_high_u8:
; CHECK: uaddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -159,7 +159,7 @@ entry:
}
define <4 x i32> @test_vaddl_high_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vaddl_high_u16:
+; CHECK-LABEL: test_vaddl_high_u16:
; CHECK: uaddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -171,7 +171,7 @@ entry:
}
define <2 x i64> @test_vaddl_high_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vaddl_high_u32:
+; CHECK-LABEL: test_vaddl_high_u32:
; CHECK: uaddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -183,7 +183,7 @@ entry:
}
define <8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) {
-; CHECK: test_vaddw_s8:
+; CHECK-LABEL: test_vaddw_s8:
; CHECK: saddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
entry:
%vmovl.i.i = sext <8 x i8> %b to <8 x i16>
@@ -192,7 +192,7 @@ entry:
}
define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) {
-; CHECK: test_vaddw_s16:
+; CHECK-LABEL: test_vaddw_s16:
; CHECK: saddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
entry:
%vmovl.i.i = sext <4 x i16> %b to <4 x i32>
@@ -201,7 +201,7 @@ entry:
}
define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) {
-; CHECK: test_vaddw_s32:
+; CHECK-LABEL: test_vaddw_s32:
; CHECK: saddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
entry:
%vmovl.i.i = sext <2 x i32> %b to <2 x i64>
@@ -210,7 +210,7 @@ entry:
}
define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) {
-; CHECK: test_vaddw_u8:
+; CHECK-LABEL: test_vaddw_u8:
; CHECK: uaddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
entry:
%vmovl.i.i = zext <8 x i8> %b to <8 x i16>
@@ -219,7 +219,7 @@ entry:
}
define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) {
-; CHECK: test_vaddw_u16:
+; CHECK-LABEL: test_vaddw_u16:
; CHECK: uaddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
entry:
%vmovl.i.i = zext <4 x i16> %b to <4 x i32>
@@ -228,7 +228,7 @@ entry:
}
define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) {
-; CHECK: test_vaddw_u32:
+; CHECK-LABEL: test_vaddw_u32:
; CHECK: uaddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
entry:
%vmovl.i.i = zext <2 x i32> %b to <2 x i64>
@@ -237,7 +237,7 @@ entry:
}
define <8 x i16> @test_vaddw_high_s8(<8 x i16> %a, <16 x i8> %b) {
-; CHECK: test_vaddw_high_s8:
+; CHECK-LABEL: test_vaddw_high_s8:
; CHECK: saddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
entry:
%shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -247,7 +247,7 @@ entry:
}
define <4 x i32> @test_vaddw_high_s16(<4 x i32> %a, <8 x i16> %b) {
-; CHECK: test_vaddw_high_s16:
+; CHECK-LABEL: test_vaddw_high_s16:
; CHECK: saddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
entry:
%shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -257,7 +257,7 @@ entry:
}
define <2 x i64> @test_vaddw_high_s32(<2 x i64> %a, <4 x i32> %b) {
-; CHECK: test_vaddw_high_s32:
+; CHECK-LABEL: test_vaddw_high_s32:
; CHECK: saddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
entry:
%shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -267,7 +267,7 @@ entry:
}
define <8 x i16> @test_vaddw_high_u8(<8 x i16> %a, <16 x i8> %b) {
-; CHECK: test_vaddw_high_u8:
+; CHECK-LABEL: test_vaddw_high_u8:
; CHECK: uaddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
entry:
%shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -277,7 +277,7 @@ entry:
}
define <4 x i32> @test_vaddw_high_u16(<4 x i32> %a, <8 x i16> %b) {
-; CHECK: test_vaddw_high_u16:
+; CHECK-LABEL: test_vaddw_high_u16:
; CHECK: uaddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
entry:
%shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -287,7 +287,7 @@ entry:
}
define <2 x i64> @test_vaddw_high_u32(<2 x i64> %a, <4 x i32> %b) {
-; CHECK: test_vaddw_high_u32:
+; CHECK-LABEL: test_vaddw_high_u32:
; CHECK: uaddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
entry:
%shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -297,7 +297,7 @@ entry:
}
define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vsubl_s8:
+; CHECK-LABEL: test_vsubl_s8:
; CHECK: ssubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%vmovl.i.i = sext <8 x i8> %a to <8 x i16>
@@ -307,7 +307,7 @@ entry:
}
define <4 x i32> @test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vsubl_s16:
+; CHECK-LABEL: test_vsubl_s16:
; CHECK: ssubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%vmovl.i.i = sext <4 x i16> %a to <4 x i32>
@@ -317,7 +317,7 @@ entry:
}
define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vsubl_s32:
+; CHECK-LABEL: test_vsubl_s32:
; CHECK: ssubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vmovl.i.i = sext <2 x i32> %a to <2 x i64>
@@ -327,7 +327,7 @@ entry:
}
define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vsubl_u8:
+; CHECK-LABEL: test_vsubl_u8:
; CHECK: usubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%vmovl.i.i = zext <8 x i8> %a to <8 x i16>
@@ -337,7 +337,7 @@ entry:
}
define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vsubl_u16:
+; CHECK-LABEL: test_vsubl_u16:
; CHECK: usubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%vmovl.i.i = zext <4 x i16> %a to <4 x i32>
@@ -347,7 +347,7 @@ entry:
}
define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vsubl_u32:
+; CHECK-LABEL: test_vsubl_u32:
; CHECK: usubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vmovl.i.i = zext <2 x i32> %a to <2 x i64>
@@ -357,7 +357,7 @@ entry:
}
define <8 x i16> @test_vsubl_high_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vsubl_high_s8:
+; CHECK-LABEL: test_vsubl_high_s8:
; CHECK: ssubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -369,7 +369,7 @@ entry:
}
define <4 x i32> @test_vsubl_high_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsubl_high_s16:
+; CHECK-LABEL: test_vsubl_high_s16:
; CHECK: ssubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -381,7 +381,7 @@ entry:
}
define <2 x i64> @test_vsubl_high_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vsubl_high_s32:
+; CHECK-LABEL: test_vsubl_high_s32:
; CHECK: ssubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -393,7 +393,7 @@ entry:
}
define <8 x i16> @test_vsubl_high_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vsubl_high_u8:
+; CHECK-LABEL: test_vsubl_high_u8:
; CHECK: usubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -405,7 +405,7 @@ entry:
}
define <4 x i32> @test_vsubl_high_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsubl_high_u16:
+; CHECK-LABEL: test_vsubl_high_u16:
; CHECK: usubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -417,7 +417,7 @@ entry:
}
define <2 x i64> @test_vsubl_high_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vsubl_high_u32:
+; CHECK-LABEL: test_vsubl_high_u32:
; CHECK: usubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -429,7 +429,7 @@ entry:
}
define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) {
-; CHECK: test_vsubw_s8:
+; CHECK-LABEL: test_vsubw_s8:
; CHECK: ssubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
entry:
%vmovl.i.i = sext <8 x i8> %b to <8 x i16>
@@ -438,7 +438,7 @@ entry:
}
define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) {
-; CHECK: test_vsubw_s16:
+; CHECK-LABEL: test_vsubw_s16:
; CHECK: ssubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
entry:
%vmovl.i.i = sext <4 x i16> %b to <4 x i32>
@@ -447,7 +447,7 @@ entry:
}
define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) {
-; CHECK: test_vsubw_s32:
+; CHECK-LABEL: test_vsubw_s32:
; CHECK: ssubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
entry:
%vmovl.i.i = sext <2 x i32> %b to <2 x i64>
@@ -456,7 +456,7 @@ entry:
}
define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) {
-; CHECK: test_vsubw_u8:
+; CHECK-LABEL: test_vsubw_u8:
; CHECK: usubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
entry:
%vmovl.i.i = zext <8 x i8> %b to <8 x i16>
@@ -465,7 +465,7 @@ entry:
}
define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) {
-; CHECK: test_vsubw_u16:
+; CHECK-LABEL: test_vsubw_u16:
; CHECK: usubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
entry:
%vmovl.i.i = zext <4 x i16> %b to <4 x i32>
@@ -474,7 +474,7 @@ entry:
}
define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) {
-; CHECK: test_vsubw_u32:
+; CHECK-LABEL: test_vsubw_u32:
; CHECK: usubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
entry:
%vmovl.i.i = zext <2 x i32> %b to <2 x i64>
@@ -483,7 +483,7 @@ entry:
}
define <8 x i16> @test_vsubw_high_s8(<8 x i16> %a, <16 x i8> %b) {
-; CHECK: test_vsubw_high_s8:
+; CHECK-LABEL: test_vsubw_high_s8:
; CHECK: ssubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
entry:
%shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -493,7 +493,7 @@ entry:
}
define <4 x i32> @test_vsubw_high_s16(<4 x i32> %a, <8 x i16> %b) {
-; CHECK: test_vsubw_high_s16:
+; CHECK-LABEL: test_vsubw_high_s16:
; CHECK: ssubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
entry:
%shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -503,7 +503,7 @@ entry:
}
define <2 x i64> @test_vsubw_high_s32(<2 x i64> %a, <4 x i32> %b) {
-; CHECK: test_vsubw_high_s32:
+; CHECK-LABEL: test_vsubw_high_s32:
; CHECK: ssubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
entry:
%shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -513,7 +513,7 @@ entry:
}
define <8 x i16> @test_vsubw_high_u8(<8 x i16> %a, <16 x i8> %b) {
-; CHECK: test_vsubw_high_u8:
+; CHECK-LABEL: test_vsubw_high_u8:
; CHECK: usubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
entry:
%shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -523,7 +523,7 @@ entry:
}
define <4 x i32> @test_vsubw_high_u16(<4 x i32> %a, <8 x i16> %b) {
-; CHECK: test_vsubw_high_u16:
+; CHECK-LABEL: test_vsubw_high_u16:
; CHECK: usubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
entry:
%shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -533,7 +533,7 @@ entry:
}
define <2 x i64> @test_vsubw_high_u32(<2 x i64> %a, <4 x i32> %b) {
-; CHECK: test_vsubw_high_u32:
+; CHECK-LABEL: test_vsubw_high_u32:
; CHECK: usubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
entry:
%shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -543,7 +543,7 @@ entry:
}
define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vaddhn_s16:
+; CHECK-LABEL: test_vaddhn_s16:
; CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%vaddhn.i = add <8 x i16> %a, %b
@@ -553,7 +553,7 @@ entry:
}
define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vaddhn_s32:
+; CHECK-LABEL: test_vaddhn_s32:
; CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%vaddhn.i = add <4 x i32> %a, %b
@@ -563,7 +563,7 @@ entry:
}
define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vaddhn_s64:
+; CHECK-LABEL: test_vaddhn_s64:
; CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%vaddhn.i = add <2 x i64> %a, %b
@@ -573,7 +573,7 @@ entry:
}
define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vaddhn_u16:
+; CHECK-LABEL: test_vaddhn_u16:
; CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%vaddhn.i = add <8 x i16> %a, %b
@@ -583,7 +583,7 @@ entry:
}
define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vaddhn_u32:
+; CHECK-LABEL: test_vaddhn_u32:
; CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%vaddhn.i = add <4 x i32> %a, %b
@@ -593,7 +593,7 @@ entry:
}
define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vaddhn_u64:
+; CHECK-LABEL: test_vaddhn_u64:
; CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%vaddhn.i = add <2 x i64> %a, %b
@@ -603,7 +603,7 @@ entry:
}
define <16 x i8> @test_vaddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vaddhn_high_s16:
+; CHECK-LABEL: test_vaddhn_high_s16:
; CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%vaddhn.i.i = add <8 x i16> %a, %b
@@ -617,7 +617,7 @@ entry:
}
define <8 x i16> @test_vaddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vaddhn_high_s32:
+; CHECK-LABEL: test_vaddhn_high_s32:
; CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%vaddhn.i.i = add <4 x i32> %a, %b
@@ -631,7 +631,7 @@ entry:
}
define <4 x i32> @test_vaddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vaddhn_high_s64:
+; CHECK-LABEL: test_vaddhn_high_s64:
; CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%vaddhn.i.i = add <2 x i64> %a, %b
@@ -645,7 +645,7 @@ entry:
}
define <16 x i8> @test_vaddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vaddhn_high_u16:
+; CHECK-LABEL: test_vaddhn_high_u16:
; CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%vaddhn.i.i = add <8 x i16> %a, %b
@@ -659,7 +659,7 @@ entry:
}
define <8 x i16> @test_vaddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vaddhn_high_u32:
+; CHECK-LABEL: test_vaddhn_high_u32:
; CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%vaddhn.i.i = add <4 x i32> %a, %b
@@ -673,7 +673,7 @@ entry:
}
define <4 x i32> @test_vaddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vaddhn_high_u64:
+; CHECK-LABEL: test_vaddhn_high_u64:
; CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%vaddhn.i.i = add <2 x i64> %a, %b
@@ -687,58 +687,58 @@ entry:
}
define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vraddhn_s16:
+; CHECK-LABEL: test_vraddhn_s16:
; CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
- %vraddhn2.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
ret <8 x i8> %vraddhn2.i
}
define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vraddhn_s32:
+; CHECK-LABEL: test_vraddhn_s32:
; CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
- %vraddhn2.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
ret <4 x i16> %vraddhn2.i
}
define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vraddhn_s64:
+; CHECK-LABEL: test_vraddhn_s64:
; CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
- %vraddhn2.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
ret <2 x i32> %vraddhn2.i
}
define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vraddhn_u16:
+; CHECK-LABEL: test_vraddhn_u16:
; CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
- %vraddhn2.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
ret <8 x i8> %vraddhn2.i
}
define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vraddhn_u32:
+; CHECK-LABEL: test_vraddhn_u32:
; CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
- %vraddhn2.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
ret <4 x i16> %vraddhn2.i
}
define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vraddhn_u64:
+; CHECK-LABEL: test_vraddhn_u64:
; CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
- %vraddhn2.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
ret <2 x i32> %vraddhn2.i
}
define <16 x i8> @test_vraddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vraddhn_high_s16:
+; CHECK-LABEL: test_vraddhn_high_s16:
; CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
- %vraddhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ %vraddhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
%0 = bitcast <8 x i8> %r to <1 x i64>
%1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -747,10 +747,10 @@ entry:
}
define <8 x i16> @test_vraddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vraddhn_high_s32:
+; CHECK-LABEL: test_vraddhn_high_s32:
; CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
- %vraddhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ %vraddhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
%0 = bitcast <4 x i16> %r to <1 x i64>
%1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -759,10 +759,10 @@ entry:
}
define <4 x i32> @test_vraddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vraddhn_high_s64:
+; CHECK-LABEL: test_vraddhn_high_s64:
; CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
- %vraddhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ %vraddhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
%0 = bitcast <2 x i32> %r to <1 x i64>
%1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -771,10 +771,10 @@ entry:
}
define <16 x i8> @test_vraddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vraddhn_high_u16:
+; CHECK-LABEL: test_vraddhn_high_u16:
; CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
- %vraddhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ %vraddhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
%0 = bitcast <8 x i8> %r to <1 x i64>
%1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -783,10 +783,10 @@ entry:
}
define <8 x i16> @test_vraddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vraddhn_high_u32:
+; CHECK-LABEL: test_vraddhn_high_u32:
; CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
- %vraddhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ %vraddhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
%0 = bitcast <4 x i16> %r to <1 x i64>
%1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -795,10 +795,10 @@ entry:
}
define <4 x i32> @test_vraddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vraddhn_high_u64:
+; CHECK-LABEL: test_vraddhn_high_u64:
; CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
- %vraddhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ %vraddhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
%0 = bitcast <2 x i32> %r to <1 x i64>
%1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -807,7 +807,7 @@ entry:
}
define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsubhn_s16:
+; CHECK-LABEL: test_vsubhn_s16:
; CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%vsubhn.i = sub <8 x i16> %a, %b
@@ -817,7 +817,7 @@ entry:
}
define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vsubhn_s32:
+; CHECK-LABEL: test_vsubhn_s32:
; CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%vsubhn.i = sub <4 x i32> %a, %b
@@ -827,7 +827,7 @@ entry:
}
define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vsubhn_s64:
+; CHECK-LABEL: test_vsubhn_s64:
; CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%vsubhn.i = sub <2 x i64> %a, %b
@@ -837,7 +837,7 @@ entry:
}
define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsubhn_u16:
+; CHECK-LABEL: test_vsubhn_u16:
; CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%vsubhn.i = sub <8 x i16> %a, %b
@@ -847,7 +847,7 @@ entry:
}
define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vsubhn_u32:
+; CHECK-LABEL: test_vsubhn_u32:
; CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%vsubhn.i = sub <4 x i32> %a, %b
@@ -857,7 +857,7 @@ entry:
}
define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vsubhn_u64:
+; CHECK-LABEL: test_vsubhn_u64:
; CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%vsubhn.i = sub <2 x i64> %a, %b
@@ -867,7 +867,7 @@ entry:
}
define <16 x i8> @test_vsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsubhn_high_s16:
+; CHECK-LABEL: test_vsubhn_high_s16:
; CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%vsubhn.i.i = sub <8 x i16> %a, %b
@@ -881,7 +881,7 @@ entry:
}
define <8 x i16> @test_vsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vsubhn_high_s32:
+; CHECK-LABEL: test_vsubhn_high_s32:
; CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%vsubhn.i.i = sub <4 x i32> %a, %b
@@ -895,7 +895,7 @@ entry:
}
define <4 x i32> @test_vsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vsubhn_high_s64:
+; CHECK-LABEL: test_vsubhn_high_s64:
; CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%vsubhn.i.i = sub <2 x i64> %a, %b
@@ -909,7 +909,7 @@ entry:
}
define <16 x i8> @test_vsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsubhn_high_u16:
+; CHECK-LABEL: test_vsubhn_high_u16:
; CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%vsubhn.i.i = sub <8 x i16> %a, %b
@@ -923,7 +923,7 @@ entry:
}
define <8 x i16> @test_vsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vsubhn_high_u32:
+; CHECK-LABEL: test_vsubhn_high_u32:
; CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%vsubhn.i.i = sub <4 x i32> %a, %b
@@ -937,7 +937,7 @@ entry:
}
define <4 x i32> @test_vsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vsubhn_high_u64:
+; CHECK-LABEL: test_vsubhn_high_u64:
; CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%vsubhn.i.i = sub <2 x i64> %a, %b
@@ -951,58 +951,58 @@ entry:
}
define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vrsubhn_s16:
+; CHECK-LABEL: test_vrsubhn_s16:
; CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
- %vrsubhn2.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
ret <8 x i8> %vrsubhn2.i
}
define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vrsubhn_s32:
+; CHECK-LABEL: test_vrsubhn_s32:
; CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
- %vrsubhn2.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ %vrsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
ret <4 x i16> %vrsubhn2.i
}
define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vrsubhn_s64:
+; CHECK-LABEL: test_vrsubhn_s64:
; CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
- %vrsubhn2.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ %vrsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
ret <2 x i32> %vrsubhn2.i
}
define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vrsubhn_u16:
+; CHECK-LABEL: test_vrsubhn_u16:
; CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
- %vrsubhn2.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
ret <8 x i8> %vrsubhn2.i
}
define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vrsubhn_u32:
+; CHECK-LABEL: test_vrsubhn_u32:
; CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
- %vrsubhn2.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ %vrsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
ret <4 x i16> %vrsubhn2.i
}
define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vrsubhn_u64:
+; CHECK-LABEL: test_vrsubhn_u64:
; CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
- %vrsubhn2.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ %vrsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
ret <2 x i32> %vrsubhn2.i
}
define <16 x i8> @test_vrsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vrsubhn_high_s16:
+; CHECK-LABEL: test_vrsubhn_high_s16:
; CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
- %vrsubhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ %vrsubhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
%0 = bitcast <8 x i8> %r to <1 x i64>
%1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -1011,10 +1011,10 @@ entry:
}
define <8 x i16> @test_vrsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vrsubhn_high_s32:
+; CHECK-LABEL: test_vrsubhn_high_s32:
; CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
- %vrsubhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ %vrsubhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
%0 = bitcast <4 x i16> %r to <1 x i64>
%1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -1023,10 +1023,10 @@ entry:
}
define <4 x i32> @test_vrsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vrsubhn_high_s64:
+; CHECK-LABEL: test_vrsubhn_high_s64:
; CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
- %vrsubhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ %vrsubhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
%0 = bitcast <2 x i32> %r to <1 x i64>
%1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -1035,10 +1035,10 @@ entry:
}
define <16 x i8> @test_vrsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vrsubhn_high_u16:
+; CHECK-LABEL: test_vrsubhn_high_u16:
; CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
- %vrsubhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ %vrsubhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
%0 = bitcast <8 x i8> %r to <1 x i64>
%1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -1047,10 +1047,10 @@ entry:
}
define <8 x i16> @test_vrsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vrsubhn_high_u32:
+; CHECK-LABEL: test_vrsubhn_high_u32:
; CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
- %vrsubhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ %vrsubhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
%0 = bitcast <4 x i16> %r to <1 x i64>
%1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -1059,10 +1059,10 @@ entry:
}
define <4 x i32> @test_vrsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vrsubhn_high_u64:
+; CHECK-LABEL: test_vrsubhn_high_u64:
; CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
- %vrsubhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ %vrsubhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
%0 = bitcast <2 x i32> %r to <1 x i64>
%1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64>
%shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -1071,763 +1071,759 @@ entry:
}
define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vabdl_s8:
+; CHECK-LABEL: test_vabdl_s8:
; CHECK: sabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vabd.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b)
+ %vabd.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b)
%vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16>
ret <8 x i16> %vmovl.i.i
}
define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vabdl_s16:
+; CHECK-LABEL: test_vabdl_s16:
; CHECK: sabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vabd2.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b)
+ %vabd2.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b)
%vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32>
ret <4 x i32> %vmovl.i.i
}
define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vabdl_s32:
+; CHECK-LABEL: test_vabdl_s32:
; CHECK: sabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vabd2.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b)
+ %vabd2.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %a, <2 x i32> %b)
%vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64>
ret <2 x i64> %vmovl.i.i
}
define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vabdl_u8:
+; CHECK-LABEL: test_vabdl_u8:
; CHECK: uabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vabd.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b)
+ %vabd.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b)
%vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16>
ret <8 x i16> %vmovl.i.i
}
define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vabdl_u16:
+; CHECK-LABEL: test_vabdl_u16:
; CHECK: uabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vabd2.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b)
+ %vabd2.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %a, <4 x i16> %b)
%vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32>
ret <4 x i32> %vmovl.i.i
}
define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vabdl_u32:
+; CHECK-LABEL: test_vabdl_u32:
; CHECK: uabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vabd2.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b)
+ %vabd2.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %a, <2 x i32> %b)
%vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64>
ret <2 x i64> %vmovl.i.i
}
define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vabal_s8:
+; CHECK-LABEL: test_vabal_s8:
; CHECK: sabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c)
+ %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c)
%vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
%add.i = add <8 x i16> %vmovl.i.i.i, %a
ret <8 x i16> %add.i
}
define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
-; CHECK: test_vabal_s16:
+; CHECK-LABEL: test_vabal_s16:
; CHECK: sabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c)
+ %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> %c)
%vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
%add.i = add <4 x i32> %vmovl.i.i.i, %a
ret <4 x i32> %add.i
}
define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
-; CHECK: test_vabal_s32:
+; CHECK-LABEL: test_vabal_s32:
; CHECK: sabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c)
+ %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> %c)
%vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
%add.i = add <2 x i64> %vmovl.i.i.i, %a
ret <2 x i64> %add.i
}
define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vabal_u8:
+; CHECK-LABEL: test_vabal_u8:
; CHECK: uabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c)
+ %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c)
%vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
%add.i = add <8 x i16> %vmovl.i.i.i, %a
ret <8 x i16> %add.i
}
define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
-; CHECK: test_vabal_u16:
+; CHECK-LABEL: test_vabal_u16:
; CHECK: uabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c)
+ %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %b, <4 x i16> %c)
%vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
%add.i = add <4 x i32> %vmovl.i.i.i, %a
ret <4 x i32> %add.i
}
define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
-; CHECK: test_vabal_u32:
+; CHECK-LABEL: test_vabal_u32:
; CHECK: uabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c)
+ %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %b, <2 x i32> %c)
%vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
%add.i = add <2 x i64> %vmovl.i.i.i, %a
ret <2 x i64> %add.i
}
define <8 x i16> @test_vabdl_high_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vabdl_high_s8:
+; CHECK-LABEL: test_vabdl_high_s8:
; CHECK: sabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
%vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
ret <8 x i16> %vmovl.i.i.i
}
define <4 x i32> @test_vabdl_high_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vabdl_high_s16:
+; CHECK-LABEL: test_vabdl_high_s16:
; CHECK: sabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
%vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
ret <4 x i32> %vmovl.i.i.i
}
define <2 x i64> @test_vabdl_high_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vabdl_high_s32:
+; CHECK-LABEL: test_vabdl_high_s32:
; CHECK: sabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
%vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
ret <2 x i64> %vmovl.i.i.i
}
define <8 x i16> @test_vabdl_high_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vabdl_high_u8:
+; CHECK-LABEL: test_vabdl_high_u8:
; CHECK: uabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
%vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
ret <8 x i16> %vmovl.i.i.i
}
define <4 x i32> @test_vabdl_high_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vabdl_high_u16:
+; CHECK-LABEL: test_vabdl_high_u16:
; CHECK: uabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
%vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
ret <4 x i32> %vmovl.i.i.i
}
define <2 x i64> @test_vabdl_high_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vabdl_high_u32:
+; CHECK-LABEL: test_vabdl_high_u32:
; CHECK: uabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
%vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
ret <2 x i64> %vmovl.i.i.i
}
define <8 x i16> @test_vabal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
-; CHECK: test_vabal_high_s8:
+; CHECK-LABEL: test_vabal_high_s8:
; CHECK: sabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vabd.i.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vabd.i.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
%vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i to <8 x i16>
%add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a
ret <8 x i16> %add.i.i
}
define <4 x i32> @test_vabal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
-; CHECK: test_vabal_high_s16:
+; CHECK-LABEL: test_vabal_high_s16:
; CHECK: sabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vabd2.i.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vabd2.i.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
%vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32>
%add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a
ret <4 x i32> %add.i.i
}
define <2 x i64> @test_vabal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
-; CHECK: test_vabal_high_s32:
+; CHECK-LABEL: test_vabal_high_s32:
; CHECK: sabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vabd2.i.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vabd2.i.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
%vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64>
%add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a
ret <2 x i64> %add.i.i
}
define <8 x i16> @test_vabal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
-; CHECK: test_vabal_high_u8:
+; CHECK-LABEL: test_vabal_high_u8:
; CHECK: uabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vabd.i.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vabd.i.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
%vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i to <8 x i16>
%add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a
ret <8 x i16> %add.i.i
}
define <4 x i32> @test_vabal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
-; CHECK: test_vabal_high_u16:
+; CHECK-LABEL: test_vabal_high_u16:
; CHECK: uabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vabd2.i.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vabd2.i.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
%vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32>
%add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a
ret <4 x i32> %add.i.i
}
define <2 x i64> @test_vabal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
-; CHECK: test_vabal_high_u32:
+; CHECK-LABEL: test_vabal_high_u32:
; CHECK: uabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vabd2.i.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vabd2.i.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
%vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64>
%add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a
ret <2 x i64> %add.i.i
}
define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vmull_s8:
+; CHECK-LABEL: test_vmull_s8:
; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %a, <8 x i8> %b)
+ %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b)
ret <8 x i16> %vmull.i
}
define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vmull_s16:
+; CHECK-LABEL: test_vmull_s16:
; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %b)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %b)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vmull_s32:
+; CHECK-LABEL: test_vmull_s32:
; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %b)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %b)
ret <2 x i64> %vmull2.i
}
define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vmull_u8:
+; CHECK-LABEL: test_vmull_u8:
; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %a, <8 x i8> %b)
+ %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b)
ret <8 x i16> %vmull.i
}
define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vmull_u16:
+; CHECK-LABEL: test_vmull_u16:
; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %b)
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b)
ret <4 x i32> %vmull2.i
}
define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vmull_u32:
+; CHECK-LABEL: test_vmull_u32:
; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %b)
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %b)
ret <2 x i64> %vmull2.i
}
define <8 x i16> @test_vmull_high_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vmull_high_s8:
+; CHECK-LABEL: test_vmull_high_s8:
; CHECK: smull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
ret <8 x i16> %vmull.i.i
}
define <4 x i32> @test_vmull_high_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vmull_high_s16:
+; CHECK-LABEL: test_vmull_high_s16:
; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
ret <4 x i32> %vmull2.i.i
}
define <2 x i64> @test_vmull_high_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vmull_high_s32:
+; CHECK-LABEL: test_vmull_high_s32:
; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
ret <2 x i64> %vmull2.i.i
}
define <8 x i16> @test_vmull_high_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vmull_high_u8:
+; CHECK-LABEL: test_vmull_high_u8:
; CHECK: umull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
ret <8 x i16> %vmull.i.i
}
define <4 x i32> @test_vmull_high_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vmull_high_u16:
+; CHECK-LABEL: test_vmull_high_u16:
; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
ret <4 x i32> %vmull2.i.i
}
define <2 x i64> @test_vmull_high_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vmull_high_u32:
+; CHECK-LABEL: test_vmull_high_u32:
; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
ret <2 x i64> %vmull2.i.i
}
define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vmlal_s8:
+; CHECK-LABEL: test_vmlal_s8:
; CHECK: smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c)
+ %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c)
%add.i = add <8 x i16> %vmull.i.i, %a
ret <8 x i16> %add.i
}
define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
-; CHECK: test_vmlal_s16:
+; CHECK-LABEL: test_vmlal_s16:
; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c)
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c)
%add.i = add <4 x i32> %vmull2.i.i, %a
ret <4 x i32> %add.i
}
define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
-; CHECK: test_vmlal_s32:
+; CHECK-LABEL: test_vmlal_s32:
; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c)
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c)
%add.i = add <2 x i64> %vmull2.i.i, %a
ret <2 x i64> %add.i
}
define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vmlal_u8:
+; CHECK-LABEL: test_vmlal_u8:
; CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c)
+ %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c)
%add.i = add <8 x i16> %vmull.i.i, %a
ret <8 x i16> %add.i
}
define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
-; CHECK: test_vmlal_u16:
+; CHECK-LABEL: test_vmlal_u16:
; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c)
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c)
%add.i = add <4 x i32> %vmull2.i.i, %a
ret <4 x i32> %add.i
}
define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
-; CHECK: test_vmlal_u32:
+; CHECK-LABEL: test_vmlal_u32:
; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c)
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c)
%add.i = add <2 x i64> %vmull2.i.i, %a
ret <2 x i64> %add.i
}
define <8 x i16> @test_vmlal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
-; CHECK: test_vmlal_high_s8:
+; CHECK-LABEL: test_vmlal_high_s8:
; CHECK: smlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
%add.i.i = add <8 x i16> %vmull.i.i.i, %a
ret <8 x i16> %add.i.i
}
define <4 x i32> @test_vmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
-; CHECK: test_vmlal_high_s16:
+; CHECK-LABEL: test_vmlal_high_s16:
; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
%add.i.i = add <4 x i32> %vmull2.i.i.i, %a
ret <4 x i32> %add.i.i
}
define <2 x i64> @test_vmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
-; CHECK: test_vmlal_high_s32:
+; CHECK-LABEL: test_vmlal_high_s32:
; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
%add.i.i = add <2 x i64> %vmull2.i.i.i, %a
ret <2 x i64> %add.i.i
}
define <8 x i16> @test_vmlal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
-; CHECK: test_vmlal_high_u8:
+; CHECK-LABEL: test_vmlal_high_u8:
; CHECK: umlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
%add.i.i = add <8 x i16> %vmull.i.i.i, %a
ret <8 x i16> %add.i.i
}
define <4 x i32> @test_vmlal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
-; CHECK: test_vmlal_high_u16:
+; CHECK-LABEL: test_vmlal_high_u16:
; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
%add.i.i = add <4 x i32> %vmull2.i.i.i, %a
ret <4 x i32> %add.i.i
}
define <2 x i64> @test_vmlal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
-; CHECK: test_vmlal_high_u32:
+; CHECK-LABEL: test_vmlal_high_u32:
; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
%add.i.i = add <2 x i64> %vmull2.i.i.i, %a
ret <2 x i64> %add.i.i
}
define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vmlsl_s8:
+; CHECK-LABEL: test_vmlsl_s8:
; CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c)
+ %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c)
%sub.i = sub <8 x i16> %a, %vmull.i.i
ret <8 x i16> %sub.i
}
define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
-; CHECK: test_vmlsl_s16:
+; CHECK-LABEL: test_vmlsl_s16:
; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c)
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c)
%sub.i = sub <4 x i32> %a, %vmull2.i.i
ret <4 x i32> %sub.i
}
define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
-; CHECK: test_vmlsl_s32:
+; CHECK-LABEL: test_vmlsl_s32:
; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c)
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c)
%sub.i = sub <2 x i64> %a, %vmull2.i.i
ret <2 x i64> %sub.i
}
define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vmlsl_u8:
+; CHECK-LABEL: test_vmlsl_u8:
; CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c)
+ %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c)
%sub.i = sub <8 x i16> %a, %vmull.i.i
ret <8 x i16> %sub.i
}
define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
-; CHECK: test_vmlsl_u16:
+; CHECK-LABEL: test_vmlsl_u16:
; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c)
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c)
%sub.i = sub <4 x i32> %a, %vmull2.i.i
ret <4 x i32> %sub.i
}
define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
-; CHECK: test_vmlsl_u32:
+; CHECK-LABEL: test_vmlsl_u32:
; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c)
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c)
%sub.i = sub <2 x i64> %a, %vmull2.i.i
ret <2 x i64> %sub.i
}
define <8 x i16> @test_vmlsl_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
-; CHECK: test_vmlsl_high_s8:
+; CHECK-LABEL: test_vmlsl_high_s8:
; CHECK: smlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
%sub.i.i = sub <8 x i16> %a, %vmull.i.i.i
ret <8 x i16> %sub.i.i
}
define <4 x i32> @test_vmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
-; CHECK: test_vmlsl_high_s16:
+; CHECK-LABEL: test_vmlsl_high_s16:
; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
%sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
ret <4 x i32> %sub.i.i
}
define <2 x i64> @test_vmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
-; CHECK: test_vmlsl_high_s32:
+; CHECK-LABEL: test_vmlsl_high_s32:
; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
%sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
ret <2 x i64> %sub.i.i
}
define <8 x i16> @test_vmlsl_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
-; CHECK: test_vmlsl_high_u8:
+; CHECK-LABEL: test_vmlsl_high_u8:
; CHECK: umlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
%sub.i.i = sub <8 x i16> %a, %vmull.i.i.i
ret <8 x i16> %sub.i.i
}
define <4 x i32> @test_vmlsl_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
-; CHECK: test_vmlsl_high_u16:
+; CHECK-LABEL: test_vmlsl_high_u16:
; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
%sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
ret <4 x i32> %sub.i.i
}
define <2 x i64> @test_vmlsl_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
-; CHECK: test_vmlsl_high_u32:
+; CHECK-LABEL: test_vmlsl_high_u32:
; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
%sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
ret <2 x i64> %sub.i.i
}
define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vqdmull_s16:
+; CHECK-LABEL: test_vqdmull_s16:
; CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %b)
+ %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %b)
ret <4 x i32> %vqdmull2.i
}
define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vqdmull_s32:
+; CHECK-LABEL: test_vqdmull_s32:
; CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %b)
+ %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %b)
ret <2 x i64> %vqdmull2.i
}
define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
-; CHECK: test_vqdmlal_s16:
+; CHECK-LABEL: test_vqdmlal_s16:
; CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
- %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+ %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
+ %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
ret <4 x i32> %vqdmlal4.i
}
define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
-; CHECK: test_vqdmlal_s32:
+; CHECK-LABEL: test_vqdmlal_s32:
; CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
- %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+ %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
+ %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
ret <2 x i64> %vqdmlal4.i
}
define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
-; CHECK: test_vqdmlsl_s16:
+; CHECK-LABEL: test_vqdmlsl_s16:
; CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
- %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
- %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+ %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
+ %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
ret <4 x i32> %vqdmlsl4.i
}
define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
-; CHECK: test_vqdmlsl_s32:
+; CHECK-LABEL: test_vqdmlsl_s32:
; CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
- %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+ %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
+ %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
ret <2 x i64> %vqdmlsl4.i
}
define <4 x i32> @test_vqdmull_high_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vqdmull_high_s16:
+; CHECK-LABEL: test_vqdmull_high_s16:
; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vqdmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vqdmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
ret <4 x i32> %vqdmull2.i.i
}
define <2 x i64> @test_vqdmull_high_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vqdmull_high_s32:
+; CHECK-LABEL: test_vqdmull_high_s32:
; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vqdmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vqdmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
ret <2 x i64> %vqdmull2.i.i
}
define <4 x i32> @test_vqdmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
-; CHECK: test_vqdmlal_high_s16:
+; CHECK-LABEL: test_vqdmlal_high_s16:
; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vqdmlal2.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
- %vqdmlal4.i.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i.i)
+ %vqdmlal2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vqdmlal4.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i.i)
ret <4 x i32> %vqdmlal4.i.i
}
define <2 x i64> @test_vqdmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
-; CHECK: test_vqdmlal_high_s32:
+; CHECK-LABEL: test_vqdmlal_high_s32:
; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vqdmlal2.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
- %vqdmlal4.i.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i.i)
+ %vqdmlal2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vqdmlal4.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i.i)
ret <2 x i64> %vqdmlal4.i.i
}
define <4 x i32> @test_vqdmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
-; CHECK: test_vqdmlsl_high_s16:
+; CHECK-LABEL: test_vqdmlsl_high_s16:
; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vqdmlsl2.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
- %vqdmlsl4.i.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i.i)
+ %vqdmlsl2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vqdmlsl4.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i.i)
ret <4 x i32> %vqdmlsl4.i.i
}
define <2 x i64> @test_vqdmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
-; CHECK: test_vqdmlsl_high_s32:
+; CHECK-LABEL: test_vqdmlsl_high_s32:
; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %vqdmlsl2.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
- %vqdmlsl4.i.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i.i)
+ %vqdmlsl2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vqdmlsl4.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i.i)
ret <2 x i64> %vqdmlsl4.i.i
}
define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vmull_p8:
+; CHECK-LABEL: test_vmull_p8:
; CHECK: pmull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
- %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %a, <8 x i8> %b)
+ %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %a, <8 x i8> %b)
ret <8 x i16> %vmull.i
}
define <8 x i16> @test_vmull_high_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vmull_high_p8:
+; CHECK-LABEL: test_vmull_high_p8:
; CHECK: pmull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
ret <8 x i16> %vmull.i.i
}
define i128 @test_vmull_p64(i64 %a, i64 %b) #4 {
-; CHECK: test_vmull_p64
+; CHECK-LABEL: test_vmull_p64
; CHECK: pmull {{v[0-9]+}}.1q, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d
entry:
- %vmull.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vmull1.i = insertelement <1 x i64> undef, i64 %b, i32 0
- %vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64> %vmull.i, <1 x i64> %vmull1.i) #1
+ %vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b)
%vmull3.i = bitcast <16 x i8> %vmull2.i to i128
ret i128 %vmull3.i
}
define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #4 {
-; CHECK: test_vmull_high_p64
+; CHECK-LABEL: test_vmull_high_p64
; CHECK: pmull2 {{v[0-9]+}}.1q, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%0 = extractelement <2 x i64> %a, i32 1
%1 = extractelement <2 x i64> %b, i32 1
- %vmull.i.i = insertelement <1 x i64> undef, i64 %0, i32 0
- %vmull1.i.i = insertelement <1 x i64> undef, i64 %1, i32 0
- %vmull2.i.i = tail call <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64> %vmull.i.i, <1 x i64> %vmull1.i.i) #1
+ %vmull2.i.i = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %0, i64 %1) #1
%vmull3.i.i = bitcast <16 x i8> %vmull2.i.i to i128
ret i128 %vmull3.i.i
}
-declare <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64>, <1 x i64>) #5
+declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) #5
diff --git a/test/CodeGen/AArch64/neon-aba-abd.ll b/test/CodeGen/AArch64/arm64-neon-aba-abd.ll
index 5400984..6404ab7 100644
--- a/test/CodeGen/AArch64/neon-aba-abd.ll
+++ b/test/CodeGen/AArch64/arm64-neon-aba-abd.ll
@@ -1,18 +1,18 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>)
define <8 x i8> @test_uabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
; CHECK: test_uabd_v8i8:
- %abd = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+ %abd = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
; CHECK: uabd v0.8b, v0.8b, v1.8b
ret <8 x i8> %abd
}
define <8 x i8> @test_uaba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
; CHECK: test_uaba_v8i8:
- %abd = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+ %abd = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
%aba = add <8 x i8> %lhs, %abd
; CHECK: uaba v0.8b, v0.8b, v1.8b
ret <8 x i8> %aba
@@ -20,32 +20,32 @@ define <8 x i8> @test_uaba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
define <8 x i8> @test_sabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
; CHECK: test_sabd_v8i8:
- %abd = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+ %abd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
; CHECK: sabd v0.8b, v0.8b, v1.8b
ret <8 x i8> %abd
}
define <8 x i8> @test_saba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
; CHECK: test_saba_v8i8:
- %abd = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+ %abd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
%aba = add <8 x i8> %lhs, %abd
; CHECK: saba v0.8b, v0.8b, v1.8b
ret <8 x i8> %aba
}
-declare <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>)
define <16 x i8> @test_uabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
; CHECK: test_uabd_v16i8:
- %abd = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+ %abd = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
; CHECK: uabd v0.16b, v0.16b, v1.16b
ret <16 x i8> %abd
}
define <16 x i8> @test_uaba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
; CHECK: test_uaba_v16i8:
- %abd = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+ %abd = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
%aba = add <16 x i8> %lhs, %abd
; CHECK: uaba v0.16b, v0.16b, v1.16b
ret <16 x i8> %aba
@@ -53,32 +53,32 @@ define <16 x i8> @test_uaba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
define <16 x i8> @test_sabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
; CHECK: test_sabd_v16i8:
- %abd = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+ %abd = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
; CHECK: sabd v0.16b, v0.16b, v1.16b
ret <16 x i8> %abd
}
define <16 x i8> @test_saba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
; CHECK: test_saba_v16i8:
- %abd = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+ %abd = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
%aba = add <16 x i8> %lhs, %abd
; CHECK: saba v0.16b, v0.16b, v1.16b
ret <16 x i8> %aba
}
-declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>)
define <4 x i16> @test_uabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; CHECK: test_uabd_v4i16:
- %abd = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+ %abd = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
; CHECK: uabd v0.4h, v0.4h, v1.4h
ret <4 x i16> %abd
}
define <4 x i16> @test_uaba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; CHECK: test_uaba_v4i16:
- %abd = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+ %abd = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
%aba = add <4 x i16> %lhs, %abd
; CHECK: uaba v0.4h, v0.4h, v1.4h
ret <4 x i16> %aba
@@ -86,32 +86,32 @@ define <4 x i16> @test_uaba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
define <4 x i16> @test_sabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; CHECK: test_sabd_v4i16:
- %abd = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+ %abd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
; CHECK: sabd v0.4h, v0.4h, v1.4h
ret <4 x i16> %abd
}
define <4 x i16> @test_saba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; CHECK: test_saba_v4i16:
- %abd = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+ %abd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
%aba = add <4 x i16> %lhs, %abd
; CHECK: saba v0.4h, v0.4h, v1.4h
ret <4 x i16> %aba
}
-declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>)
define <8 x i16> @test_uabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
; CHECK: test_uabd_v8i16:
- %abd = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+ %abd = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
; CHECK: uabd v0.8h, v0.8h, v1.8h
ret <8 x i16> %abd
}
define <8 x i16> @test_uaba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
; CHECK: test_uaba_v8i16:
- %abd = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+ %abd = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
%aba = add <8 x i16> %lhs, %abd
; CHECK: uaba v0.8h, v0.8h, v1.8h
ret <8 x i16> %aba
@@ -119,32 +119,32 @@ define <8 x i16> @test_uaba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
define <8 x i16> @test_sabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
; CHECK: test_sabd_v8i16:
- %abd = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+ %abd = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
; CHECK: sabd v0.8h, v0.8h, v1.8h
ret <8 x i16> %abd
}
define <8 x i16> @test_saba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
; CHECK: test_saba_v8i16:
- %abd = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+ %abd = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
%aba = add <8 x i16> %lhs, %abd
; CHECK: saba v0.8h, v0.8h, v1.8h
ret <8 x i16> %aba
}
-declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>)
define <2 x i32> @test_uabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; CHECK: test_uabd_v2i32:
- %abd = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+ %abd = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
; CHECK: uabd v0.2s, v0.2s, v1.2s
ret <2 x i32> %abd
}
define <2 x i32> @test_uaba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; CHECK: test_uaba_v2i32:
- %abd = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+ %abd = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
%aba = add <2 x i32> %lhs, %abd
; CHECK: uaba v0.2s, v0.2s, v1.2s
ret <2 x i32> %aba
@@ -152,16 +152,16 @@ define <2 x i32> @test_uaba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
define <2 x i32> @test_sabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; CHECK: test_sabd_v2i32:
- %abd = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+ %abd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
; CHECK: sabd v0.2s, v0.2s, v1.2s
ret <2 x i32> %abd
}
define <2 x i32> @test_sabd_v2i32_const() {
; CHECK: test_sabd_v2i32_const:
-; CHECK: movi d1, #0xffffffff0000
+; CHECK: movi d1, #0x00ffffffff0000
; CHECK-NEXT: sabd v0.2s, v0.2s, v1.2s
- %1 = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(
+ %1 = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(
<2 x i32> <i32 -2147483648, i32 2147450880>,
<2 x i32> <i32 -65536, i32 65535>)
ret <2 x i32> %1
@@ -169,25 +169,25 @@ define <2 x i32> @test_sabd_v2i32_const() {
define <2 x i32> @test_saba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; CHECK: test_saba_v2i32:
- %abd = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+ %abd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
%aba = add <2 x i32> %lhs, %abd
; CHECK: saba v0.2s, v0.2s, v1.2s
ret <2 x i32> %aba
}
-declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>)
define <4 x i32> @test_uabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK: test_uabd_v4i32:
- %abd = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+ %abd = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
; CHECK: uabd v0.4s, v0.4s, v1.4s
ret <4 x i32> %abd
}
define <4 x i32> @test_uaba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK: test_uaba_v4i32:
- %abd = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+ %abd = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
%aba = add <4 x i32> %lhs, %abd
; CHECK: uaba v0.4s, v0.4s, v1.4s
ret <4 x i32> %aba
@@ -195,42 +195,42 @@ define <4 x i32> @test_uaba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
define <4 x i32> @test_sabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK: test_sabd_v4i32:
- %abd = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+ %abd = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
; CHECK: sabd v0.4s, v0.4s, v1.4s
ret <4 x i32> %abd
}
define <4 x i32> @test_saba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK: test_saba_v4i32:
- %abd = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+ %abd = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
%aba = add <4 x i32> %lhs, %abd
; CHECK: saba v0.4s, v0.4s, v1.4s
ret <4 x i32> %aba
}
-declare <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float>, <2 x float>)
+declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>)
define <2 x float> @test_fabd_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
; CHECK: test_fabd_v2f32:
- %abd = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+ %abd = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %lhs, <2 x float> %rhs)
; CHECK: fabd v0.2s, v0.2s, v1.2s
ret <2 x float> %abd
}
-declare <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>)
define <4 x float> @test_fabd_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
; CHECK: test_fabd_v4f32:
- %abd = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+ %abd = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %lhs, <4 x float> %rhs)
; CHECK: fabd v0.4s, v0.4s, v1.4s
ret <4 x float> %abd
}
-declare <2 x double> @llvm.arm.neon.vabds.v2f64(<2 x double>, <2 x double>)
+declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>)
define <2 x double> @test_fabd_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
; CHECK: test_fabd_v2f64:
- %abd = call <2 x double> @llvm.arm.neon.vabds.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+ %abd = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %lhs, <2 x double> %rhs)
; CHECK: fabd v0.2d, v0.2d, v1.2d
ret <2 x double> %abd
}
diff --git a/test/CodeGen/AArch64/arm64-neon-across.ll b/test/CodeGen/AArch64/arm64-neon-across.ll
new file mode 100644
index 0000000..3a63673
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-neon-across.ll
@@ -0,0 +1,460 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+declare float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float>)
+
+declare float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float>)
+
+declare float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float>)
+
+declare float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float>)
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8>)
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8>)
+
+declare i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8>)
+
+declare i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8>)
+
+declare i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8>)
+
+declare i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8>)
+
+declare i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8>)
+
+declare i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8>)
+
+declare i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8>)
+
+declare i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8>)
+
+declare i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8>)
+
+declare i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8>)
+
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8>)
+
+declare i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8>)
+
+define i16 @test_vaddlv_s8(<8 x i8> %a) {
+; CHECK: test_vaddlv_s8:
+; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %a)
+ %0 = trunc i32 %saddlvv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vaddlv_s16(<4 x i16> %a) {
+; CHECK: test_vaddlv_s16:
+; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> %a)
+ ret i32 %saddlvv.i
+}
+
+define i16 @test_vaddlv_u8(<8 x i8> %a) {
+; CHECK: test_vaddlv_u8:
+; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %uaddlvv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a)
+ %0 = trunc i32 %uaddlvv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vaddlv_u16(<4 x i16> %a) {
+; CHECK: test_vaddlv_u16:
+; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %uaddlvv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> %a)
+ ret i32 %uaddlvv.i
+}
+
+define i16 @test_vaddlvq_s8(<16 x i8> %a) {
+; CHECK: test_vaddlvq_s8:
+; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %a)
+ %0 = trunc i32 %saddlvv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vaddlvq_s16(<8 x i16> %a) {
+; CHECK: test_vaddlvq_s16:
+; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> %a)
+ ret i32 %saddlvv.i
+}
+
+define i64 @test_vaddlvq_s32(<4 x i32> %a) {
+; CHECK: test_vaddlvq_s32:
+; CHECK: saddlv d{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %saddlvv.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> %a)
+ ret i64 %saddlvv.i
+}
+
+define i16 @test_vaddlvq_u8(<16 x i8> %a) {
+; CHECK: test_vaddlvq_u8:
+; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %uaddlvv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> %a)
+ %0 = trunc i32 %uaddlvv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vaddlvq_u16(<8 x i16> %a) {
+; CHECK: test_vaddlvq_u16:
+; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %uaddlvv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> %a)
+ ret i32 %uaddlvv.i
+}
+
+define i64 @test_vaddlvq_u32(<4 x i32> %a) {
+; CHECK: test_vaddlvq_u32:
+; CHECK: uaddlv d{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %uaddlvv.i = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> %a)
+ ret i64 %uaddlvv.i
+}
+
+define i8 @test_vmaxv_s8(<8 x i8> %a) {
+; CHECK: test_vmaxv_s8:
+; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %smaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> %a)
+ %0 = trunc i32 %smaxv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vmaxv_s16(<4 x i16> %a) {
+; CHECK: test_vmaxv_s16:
+; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %smaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> %a)
+ %0 = trunc i32 %smaxv.i to i16
+ ret i16 %0
+}
+
+define i8 @test_vmaxv_u8(<8 x i8> %a) {
+; CHECK: test_vmaxv_u8:
+; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %umaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %a)
+ %0 = trunc i32 %umaxv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vmaxv_u16(<4 x i16> %a) {
+; CHECK: test_vmaxv_u16:
+; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %umaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> %a)
+ %0 = trunc i32 %umaxv.i to i16
+ ret i16 %0
+}
+
+define i8 @test_vmaxvq_s8(<16 x i8> %a) {
+; CHECK: test_vmaxvq_s8:
+; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %smaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> %a)
+ %0 = trunc i32 %smaxv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vmaxvq_s16(<8 x i16> %a) {
+; CHECK: test_vmaxvq_s16:
+; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %smaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> %a)
+ %0 = trunc i32 %smaxv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vmaxvq_s32(<4 x i32> %a) {
+; CHECK: test_vmaxvq_s32:
+; CHECK: smaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %smaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> %a)
+ ret i32 %smaxv.i
+}
+
+define i8 @test_vmaxvq_u8(<16 x i8> %a) {
+; CHECK: test_vmaxvq_u8:
+; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %umaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %a)
+ %0 = trunc i32 %umaxv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vmaxvq_u16(<8 x i16> %a) {
+; CHECK: test_vmaxvq_u16:
+; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %umaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> %a)
+ %0 = trunc i32 %umaxv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vmaxvq_u32(<4 x i32> %a) {
+; CHECK: test_vmaxvq_u32:
+; CHECK: umaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %umaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> %a)
+ ret i32 %umaxv.i
+}
+
+define i8 @test_vminv_s8(<8 x i8> %a) {
+; CHECK: test_vminv_s8:
+; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %sminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %a)
+ %0 = trunc i32 %sminv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vminv_s16(<4 x i16> %a) {
+; CHECK: test_vminv_s16:
+; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %sminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> %a)
+ %0 = trunc i32 %sminv.i to i16
+ ret i16 %0
+}
+
+define i8 @test_vminv_u8(<8 x i8> %a) {
+; CHECK: test_vminv_u8:
+; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %uminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %a)
+ %0 = trunc i32 %uminv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vminv_u16(<4 x i16> %a) {
+; CHECK: test_vminv_u16:
+; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %uminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> %a)
+ %0 = trunc i32 %uminv.i to i16
+ ret i16 %0
+}
+
+define i8 @test_vminvq_s8(<16 x i8> %a) {
+; CHECK: test_vminvq_s8:
+; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %sminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> %a)
+ %0 = trunc i32 %sminv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vminvq_s16(<8 x i16> %a) {
+; CHECK: test_vminvq_s16:
+; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %sminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> %a)
+ %0 = trunc i32 %sminv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vminvq_s32(<4 x i32> %a) {
+; CHECK: test_vminvq_s32:
+; CHECK: sminv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %sminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> %a)
+ ret i32 %sminv.i
+}
+
+define i8 @test_vminvq_u8(<16 x i8> %a) {
+; CHECK: test_vminvq_u8:
+; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %uminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %a)
+ %0 = trunc i32 %uminv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vminvq_u16(<8 x i16> %a) {
+; CHECK: test_vminvq_u16:
+; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %uminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> %a)
+ %0 = trunc i32 %uminv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vminvq_u32(<4 x i32> %a) {
+; CHECK: test_vminvq_u32:
+; CHECK: uminv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %uminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> %a)
+ ret i32 %uminv.i
+}
+
+define i8 @test_vaddv_s8(<8 x i8> %a) {
+; CHECK: test_vaddv_s8:
+; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a)
+ %0 = trunc i32 %vaddv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vaddv_s16(<4 x i16> %a) {
+; CHECK: test_vaddv_s16:
+; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a)
+ %0 = trunc i32 %vaddv.i to i16
+ ret i16 %0
+}
+
+define i8 @test_vaddv_u8(<8 x i8> %a) {
+; CHECK: test_vaddv_u8:
+; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a)
+ %0 = trunc i32 %vaddv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vaddv_u16(<4 x i16> %a) {
+; CHECK: test_vaddv_u16:
+; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a)
+ %0 = trunc i32 %vaddv.i to i16
+ ret i16 %0
+}
+
+define i8 @test_vaddvq_s8(<16 x i8> %a) {
+; CHECK: test_vaddvq_s8:
+; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a)
+ %0 = trunc i32 %vaddv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vaddvq_s16(<8 x i16> %a) {
+; CHECK: test_vaddvq_s16:
+; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a)
+ %0 = trunc i32 %vaddv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vaddvq_s32(<4 x i32> %a) {
+; CHECK: test_vaddvq_s32:
+; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a)
+ ret i32 %vaddv.i
+}
+
+define i8 @test_vaddvq_u8(<16 x i8> %a) {
+; CHECK: test_vaddvq_u8:
+; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a)
+ %0 = trunc i32 %vaddv.i to i8
+ ret i8 %0
+}
+
+define i16 @test_vaddvq_u16(<8 x i16> %a) {
+; CHECK: test_vaddvq_u16:
+; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a)
+ %0 = trunc i32 %vaddv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vaddvq_u32(<4 x i32> %a) {
+; CHECK: test_vaddvq_u32:
+; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a)
+ ret i32 %vaddv.i
+}
+
+define float @test_vmaxvq_f32(<4 x float> %a) {
+; CHECK: test_vmaxvq_f32:
+; CHECK: fmaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %0 = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> %a)
+ ret float %0
+}
+
+define float @test_vminvq_f32(<4 x float> %a) {
+; CHECK: test_vminvq_f32:
+; CHECK: fminv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %0 = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> %a)
+ ret float %0
+}
+
+define float @test_vmaxnmvq_f32(<4 x float> %a) {
+; CHECK: test_vmaxnmvq_f32:
+; CHECK: fmaxnmv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %0 = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> %a)
+ ret float %0
+}
+
+define float @test_vminnmvq_f32(<4 x float> %a) {
+; CHECK: test_vminnmvq_f32:
+; CHECK: fminnmv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %0 = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> %a)
+ ret float %0
+}
+
diff --git a/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll b/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll
new file mode 100644
index 0000000..d3dc1b8
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll
@@ -0,0 +1,100 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_addp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; CHECK: test_addp_v8i8:
+ %tmp1 = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: addp v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_addp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_addp_v16i8:
+ %tmp1 = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: addp v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_addp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_addp_v4i16:
+ %tmp1 = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: addp v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_addp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_addp_v8i16:
+ %tmp1 = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: addp v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_addp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_addp_v2i32:
+ %tmp1 = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: addp v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_addp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_addp_v4i32:
+ %tmp1 = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: addp v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+
+declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_addp_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_addp_v2i64:
+ %val = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: addp v0.2d, v0.2d, v1.2d
+ ret <2 x i64> %val
+}
+
+declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_faddp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_faddp_v2f32:
+ %val = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: faddp v0.2s, v0.2s, v1.2s
+ ret <2 x float> %val
+}
+
+define <4 x float> @test_faddp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_faddp_v4f32:
+ %val = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: faddp v0.4s, v0.4s, v1.4s
+ ret <4 x float> %val
+}
+
+define <2 x double> @test_faddp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_faddp_v2f64:
+ %val = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: faddp v0.2d, v0.2d, v1.2d
+ ret <2 x double> %val
+}
+
+define i32 @test_vaddv.v2i32(<2 x i32> %a) {
+; CHECK-LABEL: test_vaddv.v2i32
+; CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %1 = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a)
+ ret i32 %1
+}
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32>)
diff --git a/test/CodeGen/AArch64/neon-add-sub.ll b/test/CodeGen/AArch64/arm64-neon-add-sub.ll
index 9015237..fbde606 100644
--- a/test/CodeGen/AArch64/neon-add-sub.ll
+++ b/test/CodeGen/AArch64/arm64-neon-add-sub.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -aarch64-simd-scalar | FileCheck %s
define <8 x i8> @add8xi8(<8 x i8> %A, <8 x i8> %B) {
;CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
@@ -182,35 +182,35 @@ define <1 x double> @test_vsub_f64(<1 x double> %a, <1 x double> %b) {
define <1 x double> @test_vabd_f64(<1 x double> %a, <1 x double> %b) {
; CHECK-LABEL: test_vabd_f64
; CHECK: fabd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.arm.neon.vabds.v1f64(<1 x double> %a, <1 x double> %b)
+ %1 = tail call <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double> %a, <1 x double> %b)
ret <1 x double> %1
}
define <1 x double> @test_vmax_f64(<1 x double> %a, <1 x double> %b) {
; CHECK-LABEL: test_vmax_f64
; CHECK: fmax d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.arm.neon.vmaxs.v1f64(<1 x double> %a, <1 x double> %b)
+ %1 = tail call <1 x double> @llvm.aarch64.neon.fmax.v1f64(<1 x double> %a, <1 x double> %b)
ret <1 x double> %1
}
define <1 x double> @test_vmin_f64(<1 x double> %a, <1 x double> %b) {
; CHECK-LABEL: test_vmin_f64
; CHECK: fmin d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.arm.neon.vmins.v1f64(<1 x double> %a, <1 x double> %b)
+ %1 = tail call <1 x double> @llvm.aarch64.neon.fmin.v1f64(<1 x double> %a, <1 x double> %b)
ret <1 x double> %1
}
define <1 x double> @test_vmaxnm_f64(<1 x double> %a, <1 x double> %b) {
; CHECK-LABEL: test_vmaxnm_f64
; CHECK: fmaxnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.aarch64.neon.vmaxnm.v1f64(<1 x double> %a, <1 x double> %b)
+ %1 = tail call <1 x double> @llvm.aarch64.neon.fmaxnm.v1f64(<1 x double> %a, <1 x double> %b)
ret <1 x double> %1
}
define <1 x double> @test_vminnm_f64(<1 x double> %a, <1 x double> %b) {
; CHECK-LABEL: test_vminnm_f64
; CHECK: fminnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.aarch64.neon.vminnm.v1f64(<1 x double> %a, <1 x double> %b)
+ %1 = tail call <1 x double> @llvm.aarch64.neon.fminnm.v1f64(<1 x double> %a, <1 x double> %b)
ret <1 x double> %1
}
@@ -229,51 +229,9 @@ define <1 x double> @test_vneg_f64(<1 x double> %a) {
}
declare <1 x double> @llvm.fabs.v1f64(<1 x double>)
-declare <1 x double> @llvm.aarch64.neon.vminnm.v1f64(<1 x double>, <1 x double>)
-declare <1 x double> @llvm.aarch64.neon.vmaxnm.v1f64(<1 x double>, <1 x double>)
-declare <1 x double> @llvm.arm.neon.vmins.v1f64(<1 x double>, <1 x double>)
-declare <1 x double> @llvm.arm.neon.vmaxs.v1f64(<1 x double>, <1 x double>)
-declare <1 x double> @llvm.arm.neon.vabds.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.aarch64.neon.fminnm.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.aarch64.neon.fmaxnm.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.aarch64.neon.fmin.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.aarch64.neon.fmax.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double>, <1 x double>)
declare <1 x double> @llvm.fma.v1f64(<1 x double>, <1 x double>, <1 x double>)
-
-define <1 x i8> @test_add_v1i8(<1 x i8> %a, <1 x i8> %b) {
-;CHECK-LABEL: test_add_v1i8:
-;CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
- %c = add <1 x i8> %a, %b
- ret <1 x i8> %c
-}
-
-define <1 x i16> @test_add_v1i16(<1 x i16> %a, <1 x i16> %b) {
-;CHECK-LABEL: test_add_v1i16:
-;CHECK: add {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
- %c = add <1 x i16> %a, %b
- ret <1 x i16> %c
-}
-
-define <1 x i32> @test_add_v1i32(<1 x i32> %a, <1 x i32> %b) {
-;CHECK-LABEL: test_add_v1i32:
-;CHECK: add {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %c = add <1 x i32> %a, %b
- ret <1 x i32> %c
-}
-
-define <1 x i8> @test_sub_v1i8(<1 x i8> %a, <1 x i8> %b) {
-;CHECK-LABEL: test_sub_v1i8:
-;CHECK: sub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
- %c = sub <1 x i8> %a, %b
- ret <1 x i8> %c
-}
-
-define <1 x i16> @test_sub_v1i16(<1 x i16> %a, <1 x i16> %b) {
-;CHECK-LABEL: test_sub_v1i16:
-;CHECK: sub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
- %c = sub <1 x i16> %a, %b
- ret <1 x i16> %c
-}
-
-define <1 x i32> @test_sub_v1i32(<1 x i32> %a, <1 x i32> %b) {
-;CHECK-LABEL: test_sub_v1i32:
-;CHECK: sub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %c = sub <1 x i32> %a, %b
- ret <1 x i32> %c
-}
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll b/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll
new file mode 100644
index 0000000..cba81ef
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll
@@ -0,0 +1,1191 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu < %s | FileCheck %s
+
+define <8 x i8> @cmeq8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp eq <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmeq16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp eq <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmeq4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = icmp eq <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmeq8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = icmp eq <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmeq2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = icmp eq <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmeq4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = icmp eq <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmeq2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = icmp eq <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmne8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: mvn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ne <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmne16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: mvn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmne4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+;CHECK-NEXT: mvn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ne <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmne8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+;CHECK-NEXT: mvn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmne2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+;CHECK-NEXT: mvn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ne <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmne4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+;CHECK-NEXT: mvn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmne2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+;CHECK-NEXT: mvn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmgt8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp sgt <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmgt16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp sgt <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmgt4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = icmp sgt <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmgt8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = icmp sgt <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmgt2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = icmp sgt <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmgt4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = icmp sgt <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmgt2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = icmp sgt <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmlt8xi8(<8 x i8> %A, <8 x i8> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.8b, v1.8b, v0.8b
+ %tmp3 = icmp slt <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmlt16xi8(<16 x i8> %A, <16 x i8> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.16b, v1.16b, v0.16b
+ %tmp3 = icmp slt <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmlt4xi16(<4 x i16> %A, <4 x i16> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.4h, v1.4h, v0.4h
+ %tmp3 = icmp slt <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmlt8xi16(<8 x i16> %A, <8 x i16> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.8h, v1.8h, v0.8h
+ %tmp3 = icmp slt <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmlt2xi32(<2 x i32> %A, <2 x i32> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+ %tmp3 = icmp slt <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmlt4xi32(<4 x i32> %A, <4 x i32> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+ %tmp3 = icmp slt <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmlt2xi64(<2 x i64> %A, <2 x i64> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = icmp slt <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmge8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp sge <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmge16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp sge <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmge4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = icmp sge <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmge8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = icmp sge <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmge2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = icmp sge <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmge4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = icmp sge <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmge2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = icmp sge <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmle8xi8(<8 x i8> %A, <8 x i8> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.8b, v1.8b, v0.8b
+ %tmp3 = icmp sle <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmle16xi8(<16 x i8> %A, <16 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.16b, v1.16b, v0.16b
+ %tmp3 = icmp sle <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmle4xi16(<4 x i16> %A, <4 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.4h, v1.4h, v0.4h
+ %tmp3 = icmp sle <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmle8xi16(<8 x i16> %A, <8 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.8h, v1.8h, v0.8h
+ %tmp3 = icmp sle <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmle2xi32(<2 x i32> %A, <2 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.2s, v1.2s, v0.2s
+ %tmp3 = icmp sle <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmle4xi32(<4 x i32> %A, <4 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.4s, v1.4s, v0.4s
+ %tmp3 = icmp sle <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmle2xi64(<2 x i64> %A, <2 x i64> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = icmp sle <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmhi8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ugt <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmhi16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ugt <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmhi4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = icmp ugt <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmhi8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = icmp ugt <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmhi2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = icmp ugt <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmhi4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = icmp ugt <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmhi2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = icmp ugt <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmlo8xi8(<8 x i8> %A, <8 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.8b, v1.8b, v0.8b
+ %tmp3 = icmp ult <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmlo16xi8(<16 x i8> %A, <16 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b
+ %tmp3 = icmp ult <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmlo4xi16(<4 x i16> %A, <4 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h
+ %tmp3 = icmp ult <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmlo8xi16(<8 x i16> %A, <8 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h
+ %tmp3 = icmp ult <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmlo2xi32(<2 x i32> %A, <2 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s
+ %tmp3 = icmp ult <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmlo4xi32(<4 x i32> %A, <4 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s
+ %tmp3 = icmp ult <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmlo2xi64(<2 x i64> %A, <2 x i64> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = icmp ult <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmhs8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp uge <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmhs16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp uge <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmhs4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = icmp uge <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmhs8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = icmp uge <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmhs2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = icmp uge <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmhs4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = icmp uge <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmhs2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = icmp uge <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmls8xi8(<8 x i8> %A, <8 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
+ %tmp3 = icmp ule <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmls16xi8(<16 x i8> %A, <16 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b
+ %tmp3 = icmp ule <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmls4xi16(<4 x i16> %A, <4 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
+ %tmp3 = icmp ule <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmls8xi16(<8 x i16> %A, <8 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h
+ %tmp3 = icmp ule <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmls2xi32(<2 x i32> %A, <2 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
+ %tmp3 = icmp ule <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmls4xi32(<4 x i32> %A, <4 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s
+ %tmp3 = icmp ule <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmls2xi64(<2 x i64> %A, <2 x i64> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = icmp ule <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+
+define <8 x i8> @cmeqz8xi8(<8 x i8> %A) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+ %tmp3 = icmp eq <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmeqz16xi8(<16 x i8> %A) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
+ %tmp3 = icmp eq <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmeqz4xi16(<4 x i16> %A) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
+ %tmp3 = icmp eq <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmeqz8xi16(<8 x i16> %A) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
+ %tmp3 = icmp eq <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmeqz2xi32(<2 x i32> %A) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+ %tmp3 = icmp eq <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmeqz4xi32(<4 x i32> %A) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+ %tmp3 = icmp eq <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmeqz2xi64(<2 x i64> %A) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+ %tmp3 = icmp eq <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+
+define <8 x i8> @cmgez8xi8(<8 x i8> %A) {
+;CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+ %tmp3 = icmp sge <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmgez16xi8(<16 x i8> %A) {
+;CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
+ %tmp3 = icmp sge <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmgez4xi16(<4 x i16> %A) {
+;CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
+ %tmp3 = icmp sge <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmgez8xi16(<8 x i16> %A) {
+;CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
+ %tmp3 = icmp sge <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmgez2xi32(<2 x i32> %A) {
+;CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+ %tmp3 = icmp sge <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmgez4xi32(<4 x i32> %A) {
+;CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+ %tmp3 = icmp sge <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmgez2xi64(<2 x i64> %A) {
+;CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+ %tmp3 = icmp sge <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+
+define <8 x i8> @cmgtz8xi8(<8 x i8> %A) {
+;CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+ %tmp3 = icmp sgt <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmgtz16xi8(<16 x i8> %A) {
+;CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
+ %tmp3 = icmp sgt <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmgtz4xi16(<4 x i16> %A) {
+;CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
+ %tmp3 = icmp sgt <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmgtz8xi16(<8 x i16> %A) {
+;CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
+ %tmp3 = icmp sgt <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmgtz2xi32(<2 x i32> %A) {
+;CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+ %tmp3 = icmp sgt <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmgtz4xi32(<4 x i32> %A) {
+;CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+ %tmp3 = icmp sgt <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmgtz2xi64(<2 x i64> %A) {
+;CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+ %tmp3 = icmp sgt <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmlez8xi8(<8 x i8> %A) {
+;CHECK: cmle {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+ %tmp3 = icmp sle <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmlez16xi8(<16 x i8> %A) {
+;CHECK: cmle {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
+ %tmp3 = icmp sle <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmlez4xi16(<4 x i16> %A) {
+;CHECK: cmle {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
+ %tmp3 = icmp sle <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmlez8xi16(<8 x i16> %A) {
+;CHECK: cmle {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
+ %tmp3 = icmp sle <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmlez2xi32(<2 x i32> %A) {
+;CHECK: cmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+ %tmp3 = icmp sle <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmlez4xi32(<4 x i32> %A) {
+;CHECK: cmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+ %tmp3 = icmp sle <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmlez2xi64(<2 x i64> %A) {
+;CHECK: cmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+ %tmp3 = icmp sle <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmltz8xi8(<8 x i8> %A) {
+;CHECK: cmlt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+ %tmp3 = icmp slt <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmltz16xi8(<16 x i8> %A) {
+;CHECK: cmlt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
+ %tmp3 = icmp slt <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmltz4xi16(<4 x i16> %A) {
+;CHECK: cmlt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
+ %tmp3 = icmp slt <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmltz8xi16(<8 x i16> %A) {
+;CHECK: cmlt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
+ %tmp3 = icmp slt <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmltz2xi32(<2 x i32> %A) {
+;CHECK: cmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+ %tmp3 = icmp slt <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmltz4xi32(<4 x i32> %A) {
+;CHECK: cmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+ %tmp3 = icmp slt <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmltz2xi64(<2 x i64> %A) {
+;CHECK: cmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+ %tmp3 = icmp slt <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmneqz8xi8(<8 x i8> %A) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+;CHECK-NEXT: mvn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ne <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmneqz16xi8(<16 x i8> %A) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
+;CHECK-NEXT: mvn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmneqz4xi16(<4 x i16> %A) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
+;CHECK-NEXT: mvn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ne <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmneqz8xi16(<8 x i16> %A) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
+;CHECK-NEXT: mvn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmneqz2xi32(<2 x i32> %A) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+;CHECK-NEXT: mvn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ne <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmneqz4xi32(<4 x i32> %A) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+;CHECK-NEXT: mvn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmneqz2xi64(<2 x i64> %A) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+;CHECK-NEXT: mvn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmhsz8xi8(<8 x i8> %A) {
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, v[[ZERO]].8b
+ %tmp3 = icmp uge <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmhsz16xi8(<16 x i8> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, v[[ZERO]].16b
+ %tmp3 = icmp uge <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmhsz4xi16(<4 x i16> %A) {
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v[[ZERO]].4h
+ %tmp3 = icmp uge <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmhsz8xi16(<8 x i16> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v[[ZERO]].8h
+ %tmp3 = icmp uge <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmhsz2xi32(<2 x i32> %A) {
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, v[[ZERO]].2s
+ %tmp3 = icmp uge <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmhsz4xi32(<4 x i32> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, v[[ZERO]].4s
+ %tmp3 = icmp uge <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmhsz2xi64(<2 x i64> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, v[[ZERO]].2d
+ %tmp3 = icmp uge <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+
+define <8 x i8> @cmhiz8xi8(<8 x i8> %A) {
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, v[[ZERO]].8b
+ %tmp3 = icmp ugt <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmhiz16xi8(<16 x i8> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, v[[ZERO]].16b
+ %tmp3 = icmp ugt <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmhiz4xi16(<4 x i16> %A) {
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v[[ZERO]].4h
+ %tmp3 = icmp ugt <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmhiz8xi16(<8 x i16> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v[[ZERO]].8h
+ %tmp3 = icmp ugt <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmhiz2xi32(<2 x i32> %A) {
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, v[[ZERO]].2s
+ %tmp3 = icmp ugt <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmhiz4xi32(<4 x i32> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, v[[ZERO]].4s
+ %tmp3 = icmp ugt <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmhiz2xi64(<2 x i64> %A) {
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, v[[ZERO]].2d
+ %tmp3 = icmp ugt <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmlsz8xi8(<8 x i8> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, v[[ZERO]].8b, v0.8b
+ %tmp3 = icmp ule <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmlsz16xi8(<16 x i8> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.16b, v[[ZERO]].16b, v0.16b
+ %tmp3 = icmp ule <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmlsz4xi16(<4 x i16> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, v[[ZERO]].4h, v0.4h
+ %tmp3 = icmp ule <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmlsz8xi16(<8 x i16> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.8h, v[[ZERO]].8h, v0.8h
+ %tmp3 = icmp ule <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmlsz2xi32(<2 x i32> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, v[[ZERO]].2s, v0.2s
+ %tmp3 = icmp ule <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmlsz4xi32(<4 x i32> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.4s, v[[ZERO]].4s, v0.4s
+ %tmp3 = icmp ule <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmlsz2xi64(<2 x i64> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.2d, v[[ZERO]].2d, v0.2d
+ %tmp3 = icmp ule <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmloz8xi8(<8 x i8> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.8b, v[[ZERO]].8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ult <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmloz16xi8(<16 x i8> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.16b, v[[ZERO]].16b, v0.16b
+ %tmp3 = icmp ult <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmloz4xi16(<4 x i16> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.4h, v[[ZERO]].4h, v0.4h
+ %tmp3 = icmp ult <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmloz8xi16(<8 x i16> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.8h, v[[ZERO]].8h, v0.8h
+ %tmp3 = icmp ult <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmloz2xi32(<2 x i32> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2s, v[[ZERO]].2s, v0.2s
+ %tmp3 = icmp ult <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmloz4xi32(<4 x i32> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.4s, v[[ZERO]].4s, v0.4s
+ %tmp3 = icmp ult <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmloz2xi64(<2 x i64> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, v[[ZERO]].2d, v0.2d
+ %tmp3 = icmp ult <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <1 x i64> @cmeqz_v1i64(<1 x i64> %A) {
+; CHECK-LABEL: cmeqz_v1i64:
+; CHECK: cmeq d0, d0, #0
+ %tst = icmp eq <1 x i64> %A, <i64 0>
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @cmgez_v1i64(<1 x i64> %A) {
+; CHECK-LABEL: cmgez_v1i64:
+; CHECK: cmge d0, d0, #0
+ %tst = icmp sge <1 x i64> %A, <i64 0>
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @cmgtz_v1i64(<1 x i64> %A) {
+; CHECK-LABEL: cmgtz_v1i64:
+; CHECK: cmgt d0, d0, #0
+ %tst = icmp sgt <1 x i64> %A, <i64 0>
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @cmlez_v1i64(<1 x i64> %A) {
+; CHECK-LABEL: cmlez_v1i64:
+; CHECK: cmle d0, d0, #0
+ %tst = icmp sle <1 x i64> %A, <i64 0>
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @cmltz_v1i64(<1 x i64> %A) {
+; CHECK-LABEL: cmltz_v1i64:
+; CHECK: cmlt d0, d0, #0
+ %tst = icmp slt <1 x i64> %A, <i64 0>
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmeqz_v1f64(<1 x double> %A) {
+; CHECK-LABEL: fcmeqz_v1f64:
+; CHECK: fcmeq d0, d0, #0
+ %tst = fcmp oeq <1 x double> %A, <double 0.0>
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmgez_v1f64(<1 x double> %A) {
+; CHECK-LABEL: fcmgez_v1f64:
+; CHECK: fcmge d0, d0, #0
+ %tst = fcmp oge <1 x double> %A, <double 0.0>
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmgtz_v1f64(<1 x double> %A) {
+; CHECK-LABEL: fcmgtz_v1f64:
+; CHECK: fcmgt d0, d0, #0
+ %tst = fcmp ogt <1 x double> %A, <double 0.0>
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmlez_v1f64(<1 x double> %A) {
+; CHECK-LABEL: fcmlez_v1f64:
+; CHECK: fcmle d0, d0, #0
+ %tst = fcmp ole <1 x double> %A, <double 0.0>
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmltz_v1f64(<1 x double> %A) {
+; CHECK-LABEL: fcmltz_v1f64:
+; CHECK: fcmlt d0, d0, #0
+ %tst = fcmp olt <1 x double> %A, <double 0.0>
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
diff --git a/test/CodeGen/AArch64/neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll
index b4d55df..cfc2ebf 100644
--- a/test/CodeGen/AArch64/neon-copy.ll
+++ b/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1,368 +1,423 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
define <16 x i8> @ins16bw(<16 x i8> %tmp1, i8 %tmp2) {
-;CHECK: ins {{v[0-9]+}}.b[15], {{w[0-9]+}}
+; CHECK-LABEL: ins16bw:
+; CHECK: ins {{v[0-9]+}}.b[15], {{w[0-9]+}}
%tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 15
ret <16 x i8> %tmp3
}
define <8 x i16> @ins8hw(<8 x i16> %tmp1, i16 %tmp2) {
-;CHECK: ins {{v[0-9]+}}.h[6], {{w[0-9]+}}
+; CHECK-LABEL: ins8hw:
+; CHECK: ins {{v[0-9]+}}.h[6], {{w[0-9]+}}
%tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 6
ret <8 x i16> %tmp3
}
define <4 x i32> @ins4sw(<4 x i32> %tmp1, i32 %tmp2) {
-;CHECK: ins {{v[0-9]+}}.s[2], {{w[0-9]+}}
+; CHECK-LABEL: ins4sw:
+; CHECK: ins {{v[0-9]+}}.s[2], {{w[0-9]+}}
%tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 2
ret <4 x i32> %tmp3
}
define <2 x i64> @ins2dw(<2 x i64> %tmp1, i64 %tmp2) {
-;CHECK: ins {{v[0-9]+}}.d[1], {{x[0-9]+}}
+; CHECK-LABEL: ins2dw:
+; CHECK: ins {{v[0-9]+}}.d[1], {{x[0-9]+}}
%tmp3 = insertelement <2 x i64> %tmp1, i64 %tmp2, i32 1
ret <2 x i64> %tmp3
}
define <8 x i8> @ins8bw(<8 x i8> %tmp1, i8 %tmp2) {
-;CHECK: ins {{v[0-9]+}}.b[5], {{w[0-9]+}}
+; CHECK-LABEL: ins8bw:
+; CHECK: ins {{v[0-9]+}}.b[5], {{w[0-9]+}}
%tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 5
ret <8 x i8> %tmp3
}
define <4 x i16> @ins4hw(<4 x i16> %tmp1, i16 %tmp2) {
-;CHECK: ins {{v[0-9]+}}.h[3], {{w[0-9]+}}
+; CHECK-LABEL: ins4hw:
+; CHECK: ins {{v[0-9]+}}.h[3], {{w[0-9]+}}
%tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 3
ret <4 x i16> %tmp3
}
define <2 x i32> @ins2sw(<2 x i32> %tmp1, i32 %tmp2) {
-;CHECK: ins {{v[0-9]+}}.s[1], {{w[0-9]+}}
+; CHECK-LABEL: ins2sw:
+; CHECK: ins {{v[0-9]+}}.s[1], {{w[0-9]+}}
%tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
ret <2 x i32> %tmp3
}
define <16 x i8> @ins16b16(<16 x i8> %tmp1, <16 x i8> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.b[15], {{v[0-9]+}}.b[2]
+; CHECK-LABEL: ins16b16:
+; CHECK: ins {{v[0-9]+}}.b[15], {{v[0-9]+}}.b[2]
%tmp3 = extractelement <16 x i8> %tmp1, i32 2
%tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15
ret <16 x i8> %tmp4
}
define <8 x i16> @ins8h8(<8 x i16> %tmp1, <8 x i16> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.h[7], {{v[0-9]+}}.h[2]
+; CHECK-LABEL: ins8h8:
+; CHECK: ins {{v[0-9]+}}.h[7], {{v[0-9]+}}.h[2]
%tmp3 = extractelement <8 x i16> %tmp1, i32 2
%tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7
ret <8 x i16> %tmp4
}
define <4 x i32> @ins4s4(<4 x i32> %tmp1, <4 x i32> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
+; CHECK-LABEL: ins4s4:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
%tmp3 = extractelement <4 x i32> %tmp1, i32 2
%tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1
ret <4 x i32> %tmp4
}
define <2 x i64> @ins2d2(<2 x i64> %tmp1, <2 x i64> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: ins2d2:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
%tmp3 = extractelement <2 x i64> %tmp1, i32 0
%tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1
ret <2 x i64> %tmp4
}
define <4 x float> @ins4f4(<4 x float> %tmp1, <4 x float> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
+; CHECK-LABEL: ins4f4:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
%tmp3 = extractelement <4 x float> %tmp1, i32 2
%tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1
ret <4 x float> %tmp4
}
define <2 x double> @ins2df2(<2 x double> %tmp1, <2 x double> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: ins2df2:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
%tmp3 = extractelement <2 x double> %tmp1, i32 0
%tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
ret <2 x double> %tmp4
}
define <16 x i8> @ins8b16(<8 x i8> %tmp1, <16 x i8> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.b[15], {{v[0-9]+}}.b[2]
+; CHECK-LABEL: ins8b16:
+; CHECK: ins {{v[0-9]+}}.b[15], {{v[0-9]+}}.b[2]
%tmp3 = extractelement <8 x i8> %tmp1, i32 2
%tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15
ret <16 x i8> %tmp4
}
define <8 x i16> @ins4h8(<4 x i16> %tmp1, <8 x i16> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.h[7], {{v[0-9]+}}.h[2]
+; CHECK-LABEL: ins4h8:
+; CHECK: ins {{v[0-9]+}}.h[7], {{v[0-9]+}}.h[2]
%tmp3 = extractelement <4 x i16> %tmp1, i32 2
%tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7
ret <8 x i16> %tmp4
}
define <4 x i32> @ins2s4(<2 x i32> %tmp1, <4 x i32> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: ins2s4:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[1]
%tmp3 = extractelement <2 x i32> %tmp1, i32 1
%tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1
ret <4 x i32> %tmp4
}
define <2 x i64> @ins1d2(<1 x i64> %tmp1, <2 x i64> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: ins1d2:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
%tmp3 = extractelement <1 x i64> %tmp1, i32 0
%tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1
ret <2 x i64> %tmp4
}
define <4 x float> @ins2f4(<2 x float> %tmp1, <4 x float> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: ins2f4:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[1]
%tmp3 = extractelement <2 x float> %tmp1, i32 1
%tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1
ret <4 x float> %tmp4
}
define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: ins1f2:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
%tmp3 = extractelement <1 x double> %tmp1, i32 0
%tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
ret <2 x double> %tmp4
}
define <8 x i8> @ins16b8(<16 x i8> %tmp1, <8 x i8> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[2]
+; CHECK-LABEL: ins16b8:
+; CHECK: ins {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[2]
%tmp3 = extractelement <16 x i8> %tmp1, i32 2
%tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 7
ret <8 x i8> %tmp4
}
define <4 x i16> @ins8h4(<8 x i16> %tmp1, <4 x i16> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.h[3], {{v[0-9]+}}.h[2]
+; CHECK-LABEL: ins8h4:
+; CHECK: ins {{v[0-9]+}}.h[3], {{v[0-9]+}}.h[2]
%tmp3 = extractelement <8 x i16> %tmp1, i32 2
%tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3
ret <4 x i16> %tmp4
}
define <2 x i32> @ins4s2(<4 x i32> %tmp1, <2 x i32> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
+; CHECK-LABEL: ins4s2:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
%tmp3 = extractelement <4 x i32> %tmp1, i32 2
%tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1
ret <2 x i32> %tmp4
}
define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: ins2d1:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
%tmp3 = extractelement <2 x i64> %tmp1, i32 0
%tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
ret <1 x i64> %tmp4
}
define <2 x float> @ins4f2(<4 x float> %tmp1, <2 x float> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
+; CHECK-LABEL: ins4f2:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
%tmp3 = extractelement <4 x float> %tmp1, i32 2
%tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1
ret <2 x float> %tmp4
}
define <1 x double> @ins2f1(<2 x double> %tmp1, <1 x double> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
- %tmp3 = extractelement <2 x double> %tmp1, i32 0
+; CHECK-LABEL: ins2f1:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+ %tmp3 = extractelement <2 x double> %tmp1, i32 1
%tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0
ret <1 x double> %tmp4
}
define <8 x i8> @ins8b8(<8 x i8> %tmp1, <8 x i8> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.b[4], {{v[0-9]+}}.b[2]
+; CHECK-LABEL: ins8b8:
+; CHECK: ins {{v[0-9]+}}.b[4], {{v[0-9]+}}.b[2]
%tmp3 = extractelement <8 x i8> %tmp1, i32 2
%tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 4
ret <8 x i8> %tmp4
}
define <4 x i16> @ins4h4(<4 x i16> %tmp1, <4 x i16> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.h[3], {{v[0-9]+}}.h[2]
+; CHECK-LABEL: ins4h4:
+; CHECK: ins {{v[0-9]+}}.h[3], {{v[0-9]+}}.h[2]
%tmp3 = extractelement <4 x i16> %tmp1, i32 2
%tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3
ret <4 x i16> %tmp4
}
define <2 x i32> @ins2s2(<2 x i32> %tmp1, <2 x i32> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: ins2s2:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
%tmp3 = extractelement <2 x i32> %tmp1, i32 0
%tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1
ret <2 x i32> %tmp4
}
define <1 x i64> @ins1d1(<1 x i64> %tmp1, <1 x i64> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: ins1d1:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
%tmp3 = extractelement <1 x i64> %tmp1, i32 0
%tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
ret <1 x i64> %tmp4
}
define <2 x float> @ins2f2(<2 x float> %tmp1, <2 x float> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: ins2f2:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
%tmp3 = extractelement <2 x float> %tmp1, i32 0
%tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1
ret <2 x float> %tmp4
}
define <1 x double> @ins1df1(<1 x double> %tmp1, <1 x double> %tmp2) {
-;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: ins1df1:
+; CHECK-NOT: ins {{v[0-9]+}}
%tmp3 = extractelement <1 x double> %tmp1, i32 0
%tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0
ret <1 x double> %tmp4
}
define i32 @umovw16b(<16 x i8> %tmp1) {
-;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.b[8]
+; CHECK-LABEL: umovw16b:
+; CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.b[8]
%tmp3 = extractelement <16 x i8> %tmp1, i32 8
%tmp4 = zext i8 %tmp3 to i32
ret i32 %tmp4
}
define i32 @umovw8h(<8 x i16> %tmp1) {
-;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
+; CHECK-LABEL: umovw8h:
+; CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
%tmp3 = extractelement <8 x i16> %tmp1, i32 2
%tmp4 = zext i16 %tmp3 to i32
ret i32 %tmp4
}
define i32 @umovw4s(<4 x i32> %tmp1) {
-;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.s[2]
+; CHECK-LABEL: umovw4s:
+; CHECK: mov {{w[0-9]+}}, {{v[0-9]+}}.s[2]
%tmp3 = extractelement <4 x i32> %tmp1, i32 2
ret i32 %tmp3
}
define i64 @umovx2d(<2 x i64> %tmp1) {
-;CHECK: umov {{x[0-9]+}}, {{v[0-9]+}}.d[0]
- %tmp3 = extractelement <2 x i64> %tmp1, i32 0
+; CHECK-LABEL: umovx2d:
+; CHECK: mov {{x[0-9]+}}, {{v[0-9]+}}.d[1]
+ %tmp3 = extractelement <2 x i64> %tmp1, i32 1
ret i64 %tmp3
}
define i32 @umovw8b(<8 x i8> %tmp1) {
-;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.b[7]
+; CHECK-LABEL: umovw8b:
+; CHECK: mov {{w[0-9]+}}, {{v[0-9]+}}.b[7]
%tmp3 = extractelement <8 x i8> %tmp1, i32 7
%tmp4 = zext i8 %tmp3 to i32
ret i32 %tmp4
}
define i32 @umovw4h(<4 x i16> %tmp1) {
-;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
+; CHECK-LABEL: umovw4h:
+; CHECK: mov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
%tmp3 = extractelement <4 x i16> %tmp1, i32 2
%tmp4 = zext i16 %tmp3 to i32
ret i32 %tmp4
}
define i32 @umovw2s(<2 x i32> %tmp1) {
-;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: umovw2s:
+; CHECK: mov {{w[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp3 = extractelement <2 x i32> %tmp1, i32 1
ret i32 %tmp3
}
define i64 @umovx1d(<1 x i64> %tmp1) {
-;CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
+; CHECK-LABEL: umovx1d:
+; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
%tmp3 = extractelement <1 x i64> %tmp1, i32 0
ret i64 %tmp3
}
define i32 @smovw16b(<16 x i8> %tmp1) {
-;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.b[8]
+; CHECK-LABEL: smovw16b:
+; CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.b[8]
%tmp3 = extractelement <16 x i8> %tmp1, i32 8
%tmp4 = sext i8 %tmp3 to i32
- %tmp5 = add i32 5, %tmp4
+ %tmp5 = add i32 %tmp4, %tmp4
ret i32 %tmp5
}
define i32 @smovw8h(<8 x i16> %tmp1) {
-;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
+; CHECK-LABEL: smovw8h:
+; CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
%tmp3 = extractelement <8 x i16> %tmp1, i32 2
%tmp4 = sext i16 %tmp3 to i32
- %tmp5 = add i32 5, %tmp4
+ %tmp5 = add i32 %tmp4, %tmp4
ret i32 %tmp5
}
define i32 @smovx16b(<16 x i8> %tmp1) {
-;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.b[8]
+; CHECK-LABEL: smovx16b:
+; CHECK: smov {{[xw][0-9]+}}, {{v[0-9]+}}.b[8]
%tmp3 = extractelement <16 x i8> %tmp1, i32 8
%tmp4 = sext i8 %tmp3 to i32
- ret i32 %tmp4
+ %tmp5 = add i32 %tmp4, %tmp4
+ ret i32 %tmp5
}
define i32 @smovx8h(<8 x i16> %tmp1) {
-;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.h[2]
+; CHECK-LABEL: smovx8h:
+; CHECK: smov {{[xw][0-9]+}}, {{v[0-9]+}}.h[2]
%tmp3 = extractelement <8 x i16> %tmp1, i32 2
%tmp4 = sext i16 %tmp3 to i32
ret i32 %tmp4
}
define i64 @smovx4s(<4 x i32> %tmp1) {
-;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.s[2]
+; CHECK-LABEL: smovx4s:
+; CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.s[2]
%tmp3 = extractelement <4 x i32> %tmp1, i32 2
%tmp4 = sext i32 %tmp3 to i64
ret i64 %tmp4
}
define i32 @smovw8b(<8 x i8> %tmp1) {
-;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.b[4]
+; CHECK-LABEL: smovw8b:
+; CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.b[4]
%tmp3 = extractelement <8 x i8> %tmp1, i32 4
%tmp4 = sext i8 %tmp3 to i32
- %tmp5 = add i32 5, %tmp4
+ %tmp5 = add i32 %tmp4, %tmp4
ret i32 %tmp5
}
define i32 @smovw4h(<4 x i16> %tmp1) {
-;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
+; CHECK-LABEL: smovw4h:
+; CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
%tmp3 = extractelement <4 x i16> %tmp1, i32 2
%tmp4 = sext i16 %tmp3 to i32
- %tmp5 = add i32 5, %tmp4
+ %tmp5 = add i32 %tmp4, %tmp4
ret i32 %tmp5
}
define i32 @smovx8b(<8 x i8> %tmp1) {
-;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.b[6]
+; CHECK-LABEL: smovx8b:
+; CHECK: smov {{[xw][0-9]+}}, {{v[0-9]+}}.b[6]
%tmp3 = extractelement <8 x i8> %tmp1, i32 6
%tmp4 = sext i8 %tmp3 to i32
ret i32 %tmp4
}
define i32 @smovx4h(<4 x i16> %tmp1) {
-;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.h[2]
+; CHECK-LABEL: smovx4h:
+; CHECK: smov {{[xw][0-9]+}}, {{v[0-9]+}}.h[2]
%tmp3 = extractelement <4 x i16> %tmp1, i32 2
%tmp4 = sext i16 %tmp3 to i32
ret i32 %tmp4
}
define i64 @smovx2s(<2 x i32> %tmp1) {
-;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: smovx2s:
+; CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp3 = extractelement <2 x i32> %tmp1, i32 1
%tmp4 = sext i32 %tmp3 to i64
ret i64 %tmp4
}
define <8 x i8> @test_vcopy_lane_s8(<8 x i8> %v1, <8 x i8> %v2) {
-;CHECK: ins {{v[0-9]+}}.b[5], {{v[0-9]+}}.b[3]
+; CHECK-LABEL: test_vcopy_lane_s8:
+; CHECK: ins {{v[0-9]+}}.b[5], {{v[0-9]+}}.b[3]
%vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 11, i32 6, i32 7>
ret <8 x i8> %vset_lane
}
define <16 x i8> @test_vcopyq_laneq_s8(<16 x i8> %v1, <16 x i8> %v2) {
-;CHECK: ins {{v[0-9]+}}.b[14], {{v[0-9]+}}.b[6]
+; CHECK-LABEL: test_vcopyq_laneq_s8:
+; CHECK: ins {{v[0-9]+}}.b[14], {{v[0-9]+}}.b[6]
%vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 22, i32 15>
ret <16 x i8> %vset_lane
}
define <8 x i8> @test_vcopy_lane_swap_s8(<8 x i8> %v1, <8 x i8> %v2) {
-;CHECK: ins {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[0]
+; CHECK-LABEL: test_vcopy_lane_swap_s8:
+; CHECK: ins {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[0]
%vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 0>
ret <8 x i8> %vset_lane
}
define <16 x i8> @test_vcopyq_laneq_swap_s8(<16 x i8> %v1, <16 x i8> %v2) {
-;CHECK: ins {{v[0-9]+}}.b[0], {{v[0-9]+}}.b[15]
+; CHECK-LABEL: test_vcopyq_laneq_swap_s8:
+; CHECK: ins {{v[0-9]+}}.b[0], {{v[0-9]+}}.b[15]
%vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 15, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <16 x i8> %vset_lane
}
define <8 x i8> @test_vdup_n_u8(i8 %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}}
+; CHECK-LABEL: test_vdup_n_u8:
+; CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}}
%vecinit.i = insertelement <8 x i8> undef, i8 %v1, i32 0
%vecinit1.i = insertelement <8 x i8> %vecinit.i, i8 %v1, i32 1
%vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 %v1, i32 2
@@ -375,7 +430,8 @@ define <8 x i8> @test_vdup_n_u8(i8 %v1) #0 {
}
define <4 x i16> @test_vdup_n_u16(i16 %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}}
+; CHECK-LABEL: test_vdup_n_u16:
+; CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}}
%vecinit.i = insertelement <4 x i16> undef, i16 %v1, i32 0
%vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %v1, i32 1
%vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %v1, i32 2
@@ -384,20 +440,23 @@ define <4 x i16> @test_vdup_n_u16(i16 %v1) #0 {
}
define <2 x i32> @test_vdup_n_u32(i32 %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.2s, {{w[0-9]+}}
+; CHECK-LABEL: test_vdup_n_u32:
+; CHECK: dup {{v[0-9]+}}.2s, {{w[0-9]+}}
%vecinit.i = insertelement <2 x i32> undef, i32 %v1, i32 0
%vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %v1, i32 1
ret <2 x i32> %vecinit1.i
}
define <1 x i64> @test_vdup_n_u64(i64 %v1) #0 {
-;CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+; CHECK-LABEL: test_vdup_n_u64:
+; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
%vecinit.i = insertelement <1 x i64> undef, i64 %v1, i32 0
ret <1 x i64> %vecinit.i
}
define <16 x i8> @test_vdupq_n_u8(i8 %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.16b, {{w[0-9]+}}
+; CHECK-LABEL: test_vdupq_n_u8:
+; CHECK: dup {{v[0-9]+}}.16b, {{w[0-9]+}}
%vecinit.i = insertelement <16 x i8> undef, i8 %v1, i32 0
%vecinit1.i = insertelement <16 x i8> %vecinit.i, i8 %v1, i32 1
%vecinit2.i = insertelement <16 x i8> %vecinit1.i, i8 %v1, i32 2
@@ -418,7 +477,8 @@ define <16 x i8> @test_vdupq_n_u8(i8 %v1) #0 {
}
define <8 x i16> @test_vdupq_n_u16(i16 %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
+; CHECK-LABEL: test_vdupq_n_u16:
+; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
%vecinit.i = insertelement <8 x i16> undef, i16 %v1, i32 0
%vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %v1, i32 1
%vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %v1, i32 2
@@ -431,7 +491,8 @@ define <8 x i16> @test_vdupq_n_u16(i16 %v1) #0 {
}
define <4 x i32> @test_vdupq_n_u32(i32 %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}}
+; CHECK-LABEL: test_vdupq_n_u32:
+; CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}}
%vecinit.i = insertelement <4 x i32> undef, i32 %v1, i32 0
%vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %v1, i32 1
%vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %v1, i32 2
@@ -440,92 +501,107 @@ define <4 x i32> @test_vdupq_n_u32(i32 %v1) #0 {
}
define <2 x i64> @test_vdupq_n_u64(i64 %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}}
+; CHECK-LABEL: test_vdupq_n_u64:
+; CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}}
%vecinit.i = insertelement <2 x i64> undef, i64 %v1, i32 0
%vecinit1.i = insertelement <2 x i64> %vecinit.i, i64 %v1, i32 1
ret <2 x i64> %vecinit1.i
}
define <8 x i8> @test_vdup_lane_s8(<8 x i8> %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5]
+; CHECK-LABEL: test_vdup_lane_s8:
+; CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5]
%shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x i8> %shuffle
}
define <4 x i16> @test_vdup_lane_s16(<4 x i16> %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2]
+; CHECK-LABEL: test_vdup_lane_s16:
+; CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2]
%shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
ret <4 x i16> %shuffle
}
define <2 x i32> @test_vdup_lane_s32(<2 x i32> %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vdup_lane_s32:
+; CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
%shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
ret <2 x i32> %shuffle
}
define <16 x i8> @test_vdupq_lane_s8(<8 x i8> %v1) #0 {
-;CHECK: {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5]
+; CHECK-LABEL: test_vdupq_lane_s8:
+; CHECK: {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5]
%shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <16 x i8> %shuffle
}
define <8 x i16> @test_vdupq_lane_s16(<4 x i16> %v1) #0 {
-;CHECK: {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2]
+; CHECK-LABEL: test_vdupq_lane_s16:
+; CHECK: {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2]
%shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
ret <8 x i16> %shuffle
}
define <4 x i32> @test_vdupq_lane_s32(<2 x i32> %v1) #0 {
-;CHECK: {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vdupq_lane_s32:
+; CHECK: {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
%shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %shuffle
}
define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %v1) #0 {
-;CHECK: {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vdupq_lane_s64:
+; CHECK: {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
%shuffle = shufflevector <1 x i64> %v1, <1 x i64> undef, <2 x i32> zeroinitializer
ret <2 x i64> %shuffle
}
define <8 x i8> @test_vdup_laneq_s8(<16 x i8> %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5]
+; CHECK-LABEL: test_vdup_laneq_s8:
+; CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5]
%shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x i8> %shuffle
}
define <4 x i16> @test_vdup_laneq_s16(<8 x i16> %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2]
+; CHECK-LABEL: test_vdup_laneq_s16:
+; CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2]
%shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
ret <4 x i16> %shuffle
}
define <2 x i32> @test_vdup_laneq_s32(<4 x i32> %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vdup_laneq_s32:
+; CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
%shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
ret <2 x i32> %shuffle
}
define <16 x i8> @test_vdupq_laneq_s8(<16 x i8> %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5]
+; CHECK-LABEL: test_vdupq_laneq_s8:
+; CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5]
%shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <16 x i8> %shuffle
}
define <8 x i16> @test_vdupq_laneq_s16(<8 x i16> %v1) #0 {
-;CHECK: {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2]
+; CHECK-LABEL: test_vdupq_laneq_s16:
+; CHECK: {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2]
%shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
ret <8 x i16> %shuffle
}
define <4 x i32> @test_vdupq_laneq_s32(<4 x i32> %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vdupq_laneq_s32:
+; CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
%shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %shuffle
}
define <2 x i64> @test_vdupq_laneq_s64(<2 x i64> %v1) #0 {
-;CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vdupq_laneq_s64:
+; CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
%shuffle = shufflevector <2 x i64> %v1, <2 x i64> undef, <2 x i32> zeroinitializer
ret <2 x i64> %shuffle
}
@@ -617,7 +693,7 @@ define <1 x double> @test_bitcasti64tov1f64(i64 %in) {
define <1 x i64> @test_bitcastv8i8tov1f64(<8 x i8> %a) #0 {
; CHECK-LABEL: test_bitcastv8i8tov1f64:
; CHECK: neg {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NEXT: fcvtzs {{[xd][0-9]+}}, {{d[0-9]+}}
%sub.i = sub <8 x i8> zeroinitializer, %a
%1 = bitcast <8 x i8> %sub.i to <1 x double>
%vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -627,7 +703,7 @@ define <1 x i64> @test_bitcastv8i8tov1f64(<8 x i8> %a) #0 {
define <1 x i64> @test_bitcastv4i16tov1f64(<4 x i16> %a) #0 {
; CHECK-LABEL: test_bitcastv4i16tov1f64:
; CHECK: neg {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NEXT: fcvtzs {{[dx][0-9]+}}, {{d[0-9]+}}
%sub.i = sub <4 x i16> zeroinitializer, %a
%1 = bitcast <4 x i16> %sub.i to <1 x double>
%vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -637,7 +713,7 @@ define <1 x i64> @test_bitcastv4i16tov1f64(<4 x i16> %a) #0 {
define <1 x i64> @test_bitcastv2i32tov1f64(<2 x i32> %a) #0 {
; CHECK-LABEL: test_bitcastv2i32tov1f64:
; CHECK: neg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NEXT: fcvtzs {{[xd][0-9]+}}, {{d[0-9]+}}
%sub.i = sub <2 x i32> zeroinitializer, %a
%1 = bitcast <2 x i32> %sub.i to <1 x double>
%vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -647,7 +723,7 @@ define <1 x i64> @test_bitcastv2i32tov1f64(<2 x i32> %a) #0 {
define <1 x i64> @test_bitcastv1i64tov1f64(<1 x i64> %a) #0 {
; CHECK-LABEL: test_bitcastv1i64tov1f64:
; CHECK: neg {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NEXT: fcvtzs {{[dx][0-9]+}}, {{d[0-9]+}}
%sub.i = sub <1 x i64> zeroinitializer, %a
%1 = bitcast <1 x i64> %sub.i to <1 x double>
%vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -657,7 +733,7 @@ define <1 x i64> @test_bitcastv1i64tov1f64(<1 x i64> %a) #0 {
define <1 x i64> @test_bitcastv2f32tov1f64(<2 x float> %a) #0 {
; CHECK-LABEL: test_bitcastv2f32tov1f64:
; CHECK: fneg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-; CHECK-NEXT: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NEXT: fcvtzs {{[xd][0-9]+}}, {{d[0-9]+}}
%sub.i = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a
%1 = bitcast <2 x float> %sub.i to <1 x double>
%vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -666,7 +742,7 @@ define <1 x i64> @test_bitcastv2f32tov1f64(<2 x float> %a) #0 {
define <8 x i8> @test_bitcastv1f64tov8i8(<1 x i64> %a) #0 {
; CHECK-LABEL: test_bitcastv1f64tov8i8:
-; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: scvtf {{d[0-9]+}}, {{[xd][0-9]+}}
; CHECK-NEXT: neg {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%vcvt.i = sitofp <1 x i64> %a to <1 x double>
%1 = bitcast <1 x double> %vcvt.i to <8 x i8>
@@ -676,7 +752,7 @@ define <8 x i8> @test_bitcastv1f64tov8i8(<1 x i64> %a) #0 {
define <4 x i16> @test_bitcastv1f64tov4i16(<1 x i64> %a) #0 {
; CHECK-LABEL: test_bitcastv1f64tov4i16:
-; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: scvtf {{d[0-9]+}}, {{[xd][0-9]+}}
; CHECK-NEXT: neg {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%vcvt.i = sitofp <1 x i64> %a to <1 x double>
%1 = bitcast <1 x double> %vcvt.i to <4 x i16>
@@ -686,7 +762,7 @@ define <4 x i16> @test_bitcastv1f64tov4i16(<1 x i64> %a) #0 {
define <2 x i32> @test_bitcastv1f64tov2i32(<1 x i64> %a) #0 {
; CHECK-LABEL: test_bitcastv1f64tov2i32:
-; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: scvtf {{d[0-9]+}}, {{[xd][0-9]+}}
; CHECK-NEXT: neg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%vcvt.i = sitofp <1 x i64> %a to <1 x double>
%1 = bitcast <1 x double> %vcvt.i to <2 x i32>
@@ -696,7 +772,7 @@ define <2 x i32> @test_bitcastv1f64tov2i32(<1 x i64> %a) #0 {
define <1 x i64> @test_bitcastv1f64tov1i64(<1 x i64> %a) #0 {
; CHECK-LABEL: test_bitcastv1f64tov1i64:
-; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: scvtf {{d[0-9]+}}, {{[xd][0-9]+}}
; CHECK-NEXT: neg {{d[0-9]+}}, {{d[0-9]+}}
%vcvt.i = sitofp <1 x i64> %a to <1 x double>
%1 = bitcast <1 x double> %vcvt.i to <1 x i64>
@@ -706,7 +782,7 @@ define <1 x i64> @test_bitcastv1f64tov1i64(<1 x i64> %a) #0 {
define <2 x float> @test_bitcastv1f64tov2f32(<1 x i64> %a) #0 {
; CHECK-LABEL: test_bitcastv1f64tov2f32:
-; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: scvtf {{d[0-9]+}}, {{[xd][0-9]+}}
; CHECK-NEXT: fneg {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%vcvt.i = sitofp <1 x i64> %a to <1 x double>
%1 = bitcast <1 x double> %vcvt.i to <2 x float>
@@ -717,49 +793,49 @@ define <2 x float> @test_bitcastv1f64tov2f32(<1 x i64> %a) #0 {
; Test insert element into an undef vector
define <8 x i8> @scalar_to_vector.v8i8(i8 %a) {
; CHECK-LABEL: scalar_to_vector.v8i8:
-; CHECK: ins {{v[0-9]+}}.b[0], {{w[0-9]+}}
+; CHECK: fmov {{s[0-9]+}}, {{w[0-9]+}}
%b = insertelement <8 x i8> undef, i8 %a, i32 0
ret <8 x i8> %b
}
define <16 x i8> @scalar_to_vector.v16i8(i8 %a) {
; CHECK-LABEL: scalar_to_vector.v16i8:
-; CHECK: ins {{v[0-9]+}}.b[0], {{w[0-9]+}}
+; CHECK: fmov {{s[0-9]+}}, {{w[0-9]+}}
%b = insertelement <16 x i8> undef, i8 %a, i32 0
ret <16 x i8> %b
}
define <4 x i16> @scalar_to_vector.v4i16(i16 %a) {
; CHECK-LABEL: scalar_to_vector.v4i16:
-; CHECK: ins {{v[0-9]+}}.h[0], {{w[0-9]+}}
+; CHECK: fmov {{s[0-9]+}}, {{w[0-9]+}}
%b = insertelement <4 x i16> undef, i16 %a, i32 0
ret <4 x i16> %b
}
define <8 x i16> @scalar_to_vector.v8i16(i16 %a) {
; CHECK-LABEL: scalar_to_vector.v8i16:
-; CHECK: ins {{v[0-9]+}}.h[0], {{w[0-9]+}}
+; CHECK: fmov {{s[0-9]+}}, {{w[0-9]+}}
%b = insertelement <8 x i16> undef, i16 %a, i32 0
ret <8 x i16> %b
}
define <2 x i32> @scalar_to_vector.v2i32(i32 %a) {
; CHECK-LABEL: scalar_to_vector.v2i32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{w[0-9]+}}
+; CHECK: fmov {{s[0-9]+}}, {{w[0-9]+}}
%b = insertelement <2 x i32> undef, i32 %a, i32 0
ret <2 x i32> %b
}
define <4 x i32> @scalar_to_vector.v4i32(i32 %a) {
; CHECK-LABEL: scalar_to_vector.v4i32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{w[0-9]+}}
+; CHECK: fmov {{s[0-9]+}}, {{w[0-9]+}}
%b = insertelement <4 x i32> undef, i32 %a, i32 0
ret <4 x i32> %b
}
define <2 x i64> @scalar_to_vector.v2i64(i64 %a) {
; CHECK-LABEL: scalar_to_vector.v2i64:
-; CHECK: ins {{v[0-9]+}}.d[0], {{x[0-9]+}}
+; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
%b = insertelement <2 x i64> undef, i64 %a, i32 0
ret <2 x i64> %b
}
@@ -954,7 +1030,7 @@ define <2 x float> @test_scalar_to_vector_f32_to_v2f32(<2 x float> %a) {
; CHECK: fmaxp s{{[0-9]+}}, v{{[0-9]+}}.2s
; CHECK-NEXT: ret
entry:
- %0 = call float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float> %a)
+ %0 = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a)
%1 = insertelement <1 x float> undef, float %0, i32 0
%2 = extractelement <1 x float> %1, i32 0
%vecinit1.i = insertelement <2 x float> undef, float %2, i32 0
@@ -966,58 +1042,57 @@ define <4 x float> @test_scalar_to_vector_f32_to_v4f32(<2 x float> %a) {
; CHECK: fmaxp s{{[0-9]+}}, v{{[0-9]+}}.2s
; CHECK-NEXT: ret
entry:
- %0 = call float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float> %a)
+ %0 = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a)
%1 = insertelement <1 x float> undef, float %0, i32 0
%2 = extractelement <1 x float> %1, i32 0
%vecinit1.i = insertelement <4 x float> undef, float %2, i32 0
ret <4 x float> %vecinit1.i
}
-declare float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float>)
-define <2 x i32> @test_concat_undef_v1i32(<1 x i32> %a) {
+define <2 x i32> @test_concat_undef_v1i32(<2 x i32> %a) {
; CHECK-LABEL: test_concat_undef_v1i32:
-; CHECK: ins v{{[0-9]+}}.s[1], v{{[0-9]+}}.s[0]
+; CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
entry:
- %0 = extractelement <1 x i32> %a, i32 0
+ %0 = extractelement <2 x i32> %a, i32 0
%vecinit1.i = insertelement <2 x i32> undef, i32 %0, i32 1
ret <2 x i32> %vecinit1.i
}
-declare <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32>) #4
+declare i32 @llvm.aarch64.neon.sqabs.i32(i32) #4
-define <2 x i32> @test_concat_v1i32_undef(<1 x i32> %a) {
+define <2 x i32> @test_concat_v1i32_undef(i32 %a) {
; CHECK-LABEL: test_concat_v1i32_undef:
; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
; CHECK-NEXT: ret
entry:
- %b = tail call <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32> %a)
- %0 = extractelement <1 x i32> %b, i32 0
- %vecinit.i432 = insertelement <2 x i32> undef, i32 %0, i32 0
+ %b = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a)
+ %vecinit.i432 = insertelement <2 x i32> undef, i32 %b, i32 0
ret <2 x i32> %vecinit.i432
}
-define <2 x i32> @test_concat_same_v1i32_v1i32(<1 x i32> %a) {
+define <2 x i32> @test_concat_same_v1i32_v1i32(<2 x i32> %a) {
; CHECK-LABEL: test_concat_same_v1i32_v1i32:
; CHECK: dup v{{[0-9]+}}.2s, v{{[0-9]+}}.s[0]
entry:
- %0 = extractelement <1 x i32> %a, i32 0
+ %0 = extractelement <2 x i32> %a, i32 0
%vecinit.i = insertelement <2 x i32> undef, i32 %0, i32 0
%vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %0, i32 1
ret <2 x i32> %vecinit1.i
}
-define <2 x i32> @test_concat_diff_v1i32_v1i32(<1 x i32> %a, <1 x i32> %b) {
+define <2 x i32> @test_concat_diff_v1i32_v1i32(i32 %a, i32 %b) {
; CHECK-LABEL: test_concat_diff_v1i32_v1i32:
; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
-; CHECK-NEXT: sqabs s{{[0-9]+}}, s{{[0-9]+}}
-; CHECK-NEXT: ins v0.s[1], v1.s[0]
+; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
+; CHECK-NEXT: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
- %c = tail call <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32> %a)
- %d = extractelement <1 x i32> %c, i32 0
- %e = tail call <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32> %b)
- %f = extractelement <1 x i32> %e, i32 0
- %h = shufflevector <1 x i32> %c, <1 x i32> %e, <2 x i32> <i32 0, i32 1>
+ %c = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a)
+ %d = insertelement <2 x i32> undef, i32 %c, i32 0
+ %e = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %b)
+ %f = insertelement <2 x i32> undef, i32 %e, i32 0
+ %h = shufflevector <2 x i32> %d, <2 x i32> %f, <2 x i32> <i32 0, i32 2>
ret <2 x i32> %h
}
@@ -1240,20 +1315,13 @@ define <4 x i32> @test_concat_v4i32_v2i32_v2i32(<2 x i32> %x, <2 x i32> %y) #0 {
; CHECK-LABEL: test_concat_v4i32_v2i32_v2i32:
; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
entry:
- %vecext = extractelement <2 x i32> %x, i32 0
- %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
- %vecext1 = extractelement <2 x i32> %x, i32 1
- %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
- %vecext3 = extractelement <2 x i32> %y, i32 0
- %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
- %vecext5 = extractelement <2 x i32> %y, i32 1
- %vecinit6 = insertelement <4 x i32> %vecinit4, i32 %vecext5, i32 3
+ %vecinit6 = shufflevector <2 x i32> %x, <2 x i32> %y, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %vecinit6
}
define <2 x i64> @test_concat_v2i64_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) #0 {
; CHECK-LABEL: test_concat_v2i64_v2i64_v2i64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%vecinit2 = shufflevector <2 x i64> %x, <2 x i64> %y, <2 x i32> <i32 0, i32 2>
ret <2 x i64> %vecinit2
@@ -1261,7 +1329,7 @@ entry:
define <2 x i64> @test_concat_v2i64_v1i64_v2i64(<1 x i64> %x, <2 x i64> %y) #0 {
; CHECK-LABEL: test_concat_v2i64_v1i64_v2i64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%vecext = extractelement <1 x i64> %x, i32 0
%vecinit = insertelement <2 x i64> undef, i64 %vecext, i32 0
@@ -1291,112 +1359,87 @@ entry:
ret <2 x i64> %vecinit2
}
-declare <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8>, <1 x i8>)
-
-; This case tests the copy of two FPR8 registers, which is implemented by fmov
-; of two FPR32 registers.
-define <1 x i8> @test_copy_FPR8_FPR8(<1 x i8> %a, <1 x i8> %b) {
-; CHECK-LABEL: test_copy_FPR8_FPR8:
-; CHECK: usqadd b1, b0
-; CHECK-NEXT: fmov s0, s1
-entry:
- %vsqadd2.i = call <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8> %b, <1 x i8> %a)
- ret <1 x i8> %vsqadd2.i
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16>, <1 x i16>)
-
-define <1 x i16> @test_copy_FPR16_FPR16(<1 x i16> %a, <1 x i16> %b) {
-; CHECK-LABEL: test_copy_FPR16_FPR16:
-; CHECK: usqadd h1, h0
-; CHECK-NEXT: fmov s0, s1
-entry:
- %vsqadd2.i = call <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16> %b, <1 x i16> %a)
- ret <1 x i16> %vsqadd2.i
-}
define <4 x i16> @concat_vector_v4i16_const() {
; CHECK-LABEL: concat_vector_v4i16_const:
-; CHECK: dup {{v[0-9]+}}.4h, wzr
+; CHECK: movi {{d[0-9]+}}, #0
%r = shufflevector <1 x i16> zeroinitializer, <1 x i16> undef, <4 x i32> zeroinitializer
ret <4 x i16> %r
}
define <4 x i16> @concat_vector_v4i16_const_one() {
; CHECK-LABEL: concat_vector_v4i16_const_one:
-; CHECK: movz {{w[0-9]+}}, #1
-; CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}}
+; CHECK: movi {{v[0-9]+}}.4h, #0x1
%r = shufflevector <1 x i16> <i16 1>, <1 x i16> undef, <4 x i32> zeroinitializer
ret <4 x i16> %r
}
define <4 x i32> @concat_vector_v4i32_const() {
; CHECK-LABEL: concat_vector_v4i32_const:
-; CHECK: dup {{v[0-9]+}}.4s, wzr
+; CHECK: movi {{v[0-9]+}}.2d, #0
%r = shufflevector <1 x i32> zeroinitializer, <1 x i32> undef, <4 x i32> zeroinitializer
ret <4 x i32> %r
}
define <8 x i8> @concat_vector_v8i8_const() {
; CHECK-LABEL: concat_vector_v8i8_const:
-; CHECK: dup {{v[0-9]+}}.8b, wzr
+; CHECK: movi {{d[0-9]+}}, #0
%r = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <8 x i32> zeroinitializer
ret <8 x i8> %r
}
define <8 x i16> @concat_vector_v8i16_const() {
; CHECK-LABEL: concat_vector_v8i16_const:
-; CHECK: dup {{v[0-9]+}}.8h, wzr
+; CHECK: movi {{v[0-9]+}}.2d, #0
%r = shufflevector <1 x i16> zeroinitializer, <1 x i16> undef, <8 x i32> zeroinitializer
ret <8 x i16> %r
}
define <8 x i16> @concat_vector_v8i16_const_one() {
; CHECK-LABEL: concat_vector_v8i16_const_one:
-; CHECK: movz {{w[0-9]+}}, #1
-; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
+; CHECK: movi {{v[0-9]+}}.8h, #0x1
%r = shufflevector <1 x i16> <i16 1>, <1 x i16> undef, <8 x i32> zeroinitializer
ret <8 x i16> %r
}
define <16 x i8> @concat_vector_v16i8_const() {
; CHECK-LABEL: concat_vector_v16i8_const:
-; CHECK: dup {{v[0-9]+}}.16b, wzr
+; CHECK: movi {{v[0-9]+}}.2d, #0
%r = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %r
}
define <4 x i16> @concat_vector_v4i16(<1 x i16> %a) {
; CHECK-LABEL: concat_vector_v4i16:
-; CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+; CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}}
%r = shufflevector <1 x i16> %a, <1 x i16> undef, <4 x i32> zeroinitializer
ret <4 x i16> %r
}
define <4 x i32> @concat_vector_v4i32(<1 x i32> %a) {
; CHECK-LABEL: concat_vector_v4i32:
-; CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}}
%r = shufflevector <1 x i32> %a, <1 x i32> undef, <4 x i32> zeroinitializer
ret <4 x i32> %r
}
define <8 x i8> @concat_vector_v8i8(<1 x i8> %a) {
; CHECK-LABEL: concat_vector_v8i8:
-; CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[0]
+; CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}}
%r = shufflevector <1 x i8> %a, <1 x i8> undef, <8 x i32> zeroinitializer
ret <8 x i8> %r
}
define <8 x i16> @concat_vector_v8i16(<1 x i16> %a) {
; CHECK-LABEL: concat_vector_v8i16:
-; CHECK: dup {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+; CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
%r = shufflevector <1 x i16> %a, <1 x i16> undef, <8 x i32> zeroinitializer
ret <8 x i16> %r
}
define <16 x i8> @concat_vector_v16i8(<1 x i8> %a) {
; CHECK-LABEL: concat_vector_v16i8:
-; CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[0]
+; CHECK: dup {{v[0-9]+}}.16b, {{w[0-9]+}}
%r = shufflevector <1 x i8> %a, <1 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %r
}
diff --git a/test/CodeGen/AArch64/arm64-neon-copyPhysReg-tuple.ll b/test/CodeGen/AArch64/arm64-neon-copyPhysReg-tuple.ll
new file mode 100644
index 0000000..276ac13
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-neon-copyPhysReg-tuple.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+; arm64 has a separate copy due to intrinsics
+
+define <4 x i32> @copyTuple.QPair(i32* %a, i32* %b) {
+; CHECK-LABEL: copyTuple.QPair:
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: ld2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x{{[0-9]+|sp}}]
+entry:
+ %vld = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 2, i32 2, i32 2, i32 2>, i64 1, i32* %a)
+ %extract = extractvalue { <4 x i32>, <4 x i32> } %vld, 0
+ %vld1 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i64 1, i32* %b)
+ %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld1, 0
+ ret <4 x i32> %vld1.fca.0.extract
+}
+
+define <4 x i32> @copyTuple.QTriple(i32* %a, i32* %b, <4 x i32> %c) {
+; CHECK-LABEL: copyTuple.QTriple:
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: ld3 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x{{[0-9]+|sp}}]
+entry:
+ %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, i64 1, i32* %a)
+ %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0
+ %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, i64 1, i32* %b)
+ %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0
+ ret <4 x i32> %vld1.fca.0.extract
+}
+
+define <4 x i32> @copyTuple.QQuad(i32* %a, i32* %b, <4 x i32> %c) {
+; CHECK-LABEL: copyTuple.QQuad:
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: ld4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x{{[0-9]+|sp}}]
+entry:
+ %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, <4 x i32> %c, i64 1, i32* %a)
+ %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0
+ %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, i64 1, i32* %b)
+ %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0
+ ret <4 x i32> %vld1.fca.0.extract
+}
+
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*)
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*)
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*)
diff --git a/test/CodeGen/AArch64/arm64-neon-mul-div.ll b/test/CodeGen/AArch64/arm64-neon-mul-div.ll
new file mode 100644
index 0000000..720f3eb
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-neon-mul-div.ll
@@ -0,0 +1,797 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+; arm64 has its own copy of this because of the intrinsics
+
+define <8 x i8> @mul8xi8(<8 x i8> %A, <8 x i8> %B) {
+; CHECK-LABEL: mul8xi8:
+; CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = mul <8 x i8> %A, %B;
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @mul16xi8(<16 x i8> %A, <16 x i8> %B) {
+; CHECK-LABEL: mul16xi8:
+; CHECK: mul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = mul <16 x i8> %A, %B;
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @mul4xi16(<4 x i16> %A, <4 x i16> %B) {
+; CHECK-LABEL: mul4xi16:
+; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = mul <4 x i16> %A, %B;
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @mul8xi16(<8 x i16> %A, <8 x i16> %B) {
+; CHECK-LABEL: mul8xi16:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = mul <8 x i16> %A, %B;
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @mul2xi32(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: mul2xi32:
+; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = mul <2 x i32> %A, %B;
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @mul4x32(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: mul4x32:
+; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = mul <4 x i32> %A, %B;
+ ret <4 x i32> %tmp3
+}
+
+define <1 x i64> @mul1xi64(<1 x i64> %A, <1 x i64> %B) {
+; CHECK-LABEL: mul1xi64:
+; CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}}
+ %tmp3 = mul <1 x i64> %A, %B;
+ ret <1 x i64> %tmp3
+}
+
+define <2 x i64> @mul2xi64(<2 x i64> %A, <2 x i64> %B) {
+; CHECK-LABEL: mul2xi64:
+; CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}}
+; CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}}
+ %tmp3 = mul <2 x i64> %A, %B;
+ ret <2 x i64> %tmp3
+}
+
+define <2 x float> @mul2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: mul2xfloat:
+; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = fmul <2 x float> %A, %B;
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @mul4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: mul4xfloat:
+; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = fmul <4 x float> %A, %B;
+ ret <4 x float> %tmp3
+}
+define <2 x double> @mul2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: mul2xdouble:
+; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = fmul <2 x double> %A, %B;
+ ret <2 x double> %tmp3
+}
+
+define <2 x float> @div2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: div2xfloat:
+; CHECK: fdiv {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = fdiv <2 x float> %A, %B;
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @div4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: div4xfloat:
+; CHECK: fdiv {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = fdiv <4 x float> %A, %B;
+ ret <4 x float> %tmp3
+}
+define <2 x double> @div2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: div2xdouble:
+; CHECK: fdiv {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = fdiv <2 x double> %A, %B;
+ ret <2 x double> %tmp3
+}
+
+define <1 x i8> @sdiv1x8(<1 x i8> %A, <1 x i8> %B) {
+; CHECK-LABEL: sdiv1x8:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <1 x i8> %A, %B;
+ ret <1 x i8> %tmp3
+}
+
+define <8 x i8> @sdiv8x8(<8 x i8> %A, <8 x i8> %B) {
+; CHECK-LABEL: sdiv8x8:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <8 x i8> %A, %B;
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @sdiv16x8(<16 x i8> %A, <16 x i8> %B) {
+; CHECK-LABEL: sdiv16x8:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <16 x i8> %A, %B;
+ ret <16 x i8> %tmp3
+}
+
+define <1 x i16> @sdiv1x16(<1 x i16> %A, <1 x i16> %B) {
+; CHECK-LABEL: sdiv1x16:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <1 x i16> %A, %B;
+ ret <1 x i16> %tmp3
+}
+
+define <4 x i16> @sdiv4x16(<4 x i16> %A, <4 x i16> %B) {
+; CHECK-LABEL: sdiv4x16:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <4 x i16> %A, %B;
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sdiv8x16(<8 x i16> %A, <8 x i16> %B) {
+; CHECK-LABEL: sdiv8x16:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <8 x i16> %A, %B;
+ ret <8 x i16> %tmp3
+}
+
+define <1 x i32> @sdiv1x32(<1 x i32> %A, <1 x i32> %B) {
+; CHECK-LABEL: sdiv1x32:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <1 x i32> %A, %B;
+ ret <1 x i32> %tmp3
+}
+
+define <2 x i32> @sdiv2x32(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: sdiv2x32:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <2 x i32> %A, %B;
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sdiv4x32(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: sdiv4x32:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = sdiv <4 x i32> %A, %B;
+ ret <4 x i32> %tmp3
+}
+
+define <1 x i64> @sdiv1x64(<1 x i64> %A, <1 x i64> %B) {
+; CHECK-LABEL: sdiv1x64:
+; CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = sdiv <1 x i64> %A, %B;
+ ret <1 x i64> %tmp3
+}
+
+define <2 x i64> @sdiv2x64(<2 x i64> %A, <2 x i64> %B) {
+; CHECK-LABEL: sdiv2x64:
+; CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = sdiv <2 x i64> %A, %B;
+ ret <2 x i64> %tmp3
+}
+
+define <1 x i8> @udiv1x8(<1 x i8> %A, <1 x i8> %B) {
+; CHECK-LABEL: udiv1x8:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <1 x i8> %A, %B;
+ ret <1 x i8> %tmp3
+}
+
+define <8 x i8> @udiv8x8(<8 x i8> %A, <8 x i8> %B) {
+; CHECK-LABEL: udiv8x8:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <8 x i8> %A, %B;
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @udiv16x8(<16 x i8> %A, <16 x i8> %B) {
+; CHECK-LABEL: udiv16x8:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <16 x i8> %A, %B;
+ ret <16 x i8> %tmp3
+}
+
+define <1 x i16> @udiv1x16(<1 x i16> %A, <1 x i16> %B) {
+; CHECK-LABEL: udiv1x16:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <1 x i16> %A, %B;
+ ret <1 x i16> %tmp3
+}
+
+define <4 x i16> @udiv4x16(<4 x i16> %A, <4 x i16> %B) {
+; CHECK-LABEL: udiv4x16:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <4 x i16> %A, %B;
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @udiv8x16(<8 x i16> %A, <8 x i16> %B) {
+; CHECK-LABEL: udiv8x16:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <8 x i16> %A, %B;
+ ret <8 x i16> %tmp3
+}
+
+define <1 x i32> @udiv1x32(<1 x i32> %A, <1 x i32> %B) {
+; CHECK-LABEL: udiv1x32:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <1 x i32> %A, %B;
+ ret <1 x i32> %tmp3
+}
+
+define <2 x i32> @udiv2x32(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: udiv2x32:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <2 x i32> %A, %B;
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @udiv4x32(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: udiv4x32:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = udiv <4 x i32> %A, %B;
+ ret <4 x i32> %tmp3
+}
+
+define <1 x i64> @udiv1x64(<1 x i64> %A, <1 x i64> %B) {
+; CHECK-LABEL: udiv1x64:
+; CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = udiv <1 x i64> %A, %B;
+ ret <1 x i64> %tmp3
+}
+
+define <2 x i64> @udiv2x64(<2 x i64> %A, <2 x i64> %B) {
+; CHECK-LABEL: udiv2x64:
+; CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = udiv <2 x i64> %A, %B;
+ ret <2 x i64> %tmp3
+}
+
+define <1 x i8> @srem1x8(<1 x i8> %A, <1 x i8> %B) {
+; CHECK-LABEL: srem1x8:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <1 x i8> %A, %B;
+ ret <1 x i8> %tmp3
+}
+
+define <8 x i8> @srem8x8(<8 x i8> %A, <8 x i8> %B) {
+; CHECK-LABEL: srem8x8:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <8 x i8> %A, %B;
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @srem16x8(<16 x i8> %A, <16 x i8> %B) {
+; CHECK-LABEL: srem16x8:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <16 x i8> %A, %B;
+ ret <16 x i8> %tmp3
+}
+
+define <1 x i16> @srem1x16(<1 x i16> %A, <1 x i16> %B) {
+; CHECK-LABEL: srem1x16:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <1 x i16> %A, %B;
+ ret <1 x i16> %tmp3
+}
+
+define <4 x i16> @srem4x16(<4 x i16> %A, <4 x i16> %B) {
+; CHECK-LABEL: srem4x16:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <4 x i16> %A, %B;
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @srem8x16(<8 x i16> %A, <8 x i16> %B) {
+; CHECK-LABEL: srem8x16:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <8 x i16> %A, %B;
+ ret <8 x i16> %tmp3
+}
+
+define <1 x i32> @srem1x32(<1 x i32> %A, <1 x i32> %B) {
+; CHECK-LABEL: srem1x32:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <1 x i32> %A, %B;
+ ret <1 x i32> %tmp3
+}
+
+define <2 x i32> @srem2x32(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: srem2x32:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <2 x i32> %A, %B;
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @srem4x32(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: srem4x32:
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = srem <4 x i32> %A, %B;
+ ret <4 x i32> %tmp3
+}
+
+define <1 x i64> @srem1x64(<1 x i64> %A, <1 x i64> %B) {
+; CHECK-LABEL: srem1x64:
+; CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = srem <1 x i64> %A, %B;
+ ret <1 x i64> %tmp3
+}
+
+define <2 x i64> @srem2x64(<2 x i64> %A, <2 x i64> %B) {
+; CHECK-LABEL: srem2x64:
+; CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = srem <2 x i64> %A, %B;
+ ret <2 x i64> %tmp3
+}
+
+define <1 x i8> @urem1x8(<1 x i8> %A, <1 x i8> %B) {
+; CHECK-LABEL: urem1x8:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <1 x i8> %A, %B;
+ ret <1 x i8> %tmp3
+}
+
+define <8 x i8> @urem8x8(<8 x i8> %A, <8 x i8> %B) {
+; CHECK-LABEL: urem8x8:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <8 x i8> %A, %B;
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @urem16x8(<16 x i8> %A, <16 x i8> %B) {
+; CHECK-LABEL: urem16x8:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <16 x i8> %A, %B;
+ ret <16 x i8> %tmp3
+}
+
+define <1 x i16> @urem1x16(<1 x i16> %A, <1 x i16> %B) {
+; CHECK-LABEL: urem1x16:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <1 x i16> %A, %B;
+ ret <1 x i16> %tmp3
+}
+
+define <4 x i16> @urem4x16(<4 x i16> %A, <4 x i16> %B) {
+; CHECK-LABEL: urem4x16:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <4 x i16> %A, %B;
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @urem8x16(<8 x i16> %A, <8 x i16> %B) {
+; CHECK-LABEL: urem8x16:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <8 x i16> %A, %B;
+ ret <8 x i16> %tmp3
+}
+
+define <1 x i32> @urem1x32(<1 x i32> %A, <1 x i32> %B) {
+; CHECK-LABEL: urem1x32:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <1 x i32> %A, %B;
+ ret <1 x i32> %tmp3
+}
+
+define <2 x i32> @urem2x32(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: urem2x32:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <2 x i32> %A, %B;
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @urem4x32(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: urem4x32:
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+ %tmp3 = urem <4 x i32> %A, %B;
+ ret <4 x i32> %tmp3
+}
+
+define <1 x i64> @urem1x64(<1 x i64> %A, <1 x i64> %B) {
+; CHECK-LABEL: urem1x64:
+; CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = urem <1 x i64> %A, %B;
+ ret <1 x i64> %tmp3
+}
+
+define <2 x i64> @urem2x64(<2 x i64> %A, <2 x i64> %B) {
+; CHECK-LABEL: urem2x64:
+; CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+ %tmp3 = urem <2 x i64> %A, %B;
+ ret <2 x i64> %tmp3
+}
+
+define <2 x float> @frem2f32(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: frem2f32:
+; CHECK: bl fmodf
+; CHECK: bl fmodf
+ %tmp3 = frem <2 x float> %A, %B;
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frem4f32(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: frem4f32:
+; CHECK: bl fmodf
+; CHECK: bl fmodf
+; CHECK: bl fmodf
+; CHECK: bl fmodf
+ %tmp3 = frem <4 x float> %A, %B;
+ ret <4 x float> %tmp3
+}
+
+define <1 x double> @frem1d64(<1 x double> %A, <1 x double> %B) {
+; CHECK-LABEL: frem1d64:
+; CHECK: bl fmod
+ %tmp3 = frem <1 x double> %A, %B;
+ ret <1 x double> %tmp3
+}
+
+define <2 x double> @frem2d64(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: frem2d64:
+; CHECK: bl fmod
+; CHECK: bl fmod
+ %tmp3 = frem <2 x double> %A, %B;
+ ret <2 x double> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8>, <8 x i8>)
+declare <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8>, <16 x i8>)
+
+define <8 x i8> @poly_mulv8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK-LABEL: poly_mulv8i8:
+ %prod = call <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: pmul v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %prod
+}
+
+define <16 x i8> @poly_mulv16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK-LABEL: poly_mulv16i8:
+ %prod = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: pmul v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %prod
+}
+
+declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i16> @test_sqdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK-LABEL: test_sqdmulh_v4i16:
+ %prod = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sqdmulh v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %prod
+}
+
+define <8 x i16> @test_sqdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK-LABEL: test_sqdmulh_v8i16:
+ %prod = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sqdmulh v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %prod
+}
+
+define <2 x i32> @test_sqdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK-LABEL: test_sqdmulh_v2i32:
+ %prod = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sqdmulh v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %prod
+}
+
+define <4 x i32> @test_sqdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqdmulh_v4i32:
+ %prod = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sqdmulh v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %prod
+}
+
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i16> @test_sqrdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmulh_v4i16:
+ %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sqrdmulh v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %prod
+}
+
+define <8 x i16> @test_sqrdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmulh_v8i16:
+ %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sqrdmulh v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %prod
+}
+
+define <2 x i32> @test_sqrdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmulh_v2i32:
+ %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sqrdmulh v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %prod
+}
+
+define <4 x i32> @test_sqrdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmulh_v4i32:
+ %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sqrdmulh v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %prod
+}
+
+declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @fmulx_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK-LABEL: fmulx_v2f32:
+; Using registers other than v0, v1 and v2 is possible, but would be odd.
+; CHECK: fmulx v0.2s, v0.2s, v1.2s
+ %val = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+ ret <2 x float> %val
+}
+
+define <4 x float> @fmulx_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK-LABEL: fmulx_v4f32:
+; Using registers other than v0, v1 and v2 is possible, but would be odd.
+; CHECK: fmulx v0.4s, v0.4s, v1.4s
+ %val = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+ ret <4 x float> %val
+}
+
+define <2 x double> @fmulx_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK-LABEL: fmulx_v2f64:
+; Using registers other than v0, v1 and v2 is possible, but would be odd.
+; CHECK: fmulx v0.2d, v0.2d, v1.2d
+ %val = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+ ret <2 x double> %val
+}
+
diff --git a/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll b/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll
index c9128e7..92ed239 100644
--- a/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll
+++ b/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) {
- ; CHECK: test_fmul_lane_ss2S
+ ; CHECK-LABEL: test_fmul_lane_ss2S
; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp1 = extractelement <2 x float> %v, i32 1
%tmp2 = fmul float %a, %tmp1;
@@ -9,7 +9,7 @@ define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) {
}
define float @test_fmul_lane_ss2S_swap(float %a, <2 x float> %v) {
- ; CHECK: test_fmul_lane_ss2S_swap
+ ; CHECK-LABEL: test_fmul_lane_ss2S_swap
; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp1 = extractelement <2 x float> %v, i32 1
%tmp2 = fmul float %tmp1, %a;
@@ -18,7 +18,7 @@ define float @test_fmul_lane_ss2S_swap(float %a, <2 x float> %v) {
define float @test_fmul_lane_ss4S(float %a, <4 x float> %v) {
- ; CHECK: test_fmul_lane_ss4S
+ ; CHECK-LABEL: test_fmul_lane_ss4S
; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = fmul float %a, %tmp1;
@@ -26,7 +26,7 @@ define float @test_fmul_lane_ss4S(float %a, <4 x float> %v) {
}
define float @test_fmul_lane_ss4S_swap(float %a, <4 x float> %v) {
- ; CHECK: test_fmul_lane_ss4S_swap
+ ; CHECK-LABEL: test_fmul_lane_ss4S_swap
; CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = fmul float %tmp1, %a;
@@ -35,8 +35,8 @@ define float @test_fmul_lane_ss4S_swap(float %a, <4 x float> %v) {
define double @test_fmul_lane_ddD(double %a, <1 x double> %v) {
- ; CHECK: test_fmul_lane_ddD
- ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+ ; CHECK-LABEL: test_fmul_lane_ddD
+ ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0]|d[0-9]+}}
%tmp1 = extractelement <1 x double> %v, i32 0
%tmp2 = fmul double %a, %tmp1;
ret double %tmp2;
@@ -45,7 +45,7 @@ define double @test_fmul_lane_ddD(double %a, <1 x double> %v) {
define double @test_fmul_lane_dd2D(double %a, <2 x double> %v) {
- ; CHECK: test_fmul_lane_dd2D
+ ; CHECK-LABEL: test_fmul_lane_dd2D
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = fmul double %a, %tmp1;
@@ -54,71 +54,71 @@ define double @test_fmul_lane_dd2D(double %a, <2 x double> %v) {
define double @test_fmul_lane_dd2D_swap(double %a, <2 x double> %v) {
- ; CHECK: test_fmul_lane_dd2D_swap
+ ; CHECK-LABEL: test_fmul_lane_dd2D_swap
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = fmul double %tmp1, %a;
ret double %tmp2;
}
-declare float @llvm.aarch64.neon.vmulx.f32(float, float)
+declare float @llvm.aarch64.neon.fmulx.f32(float, float)
define float @test_fmulx_lane_f32(float %a, <2 x float> %v) {
- ; CHECK: test_fmulx_lane_f32
+ ; CHECK-LABEL: test_fmulx_lane_f32
; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp1 = extractelement <2 x float> %v, i32 1
- %tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %tmp1)
+ %tmp2 = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %tmp1)
ret float %tmp2;
}
define float @test_fmulx_laneq_f32(float %a, <4 x float> %v) {
- ; CHECK: test_fmulx_laneq_f32
+ ; CHECK-LABEL: test_fmulx_laneq_f32
; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
- %tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %tmp1)
+ %tmp2 = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %tmp1)
ret float %tmp2;
}
define float @test_fmulx_laneq_f32_swap(float %a, <4 x float> %v) {
- ; CHECK: test_fmulx_laneq_f32_swap
+ ; CHECK-LABEL: test_fmulx_laneq_f32_swap
; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
- %tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %tmp1, float %a)
+ %tmp2 = call float @llvm.aarch64.neon.fmulx.f32(float %tmp1, float %a)
ret float %tmp2;
}
-declare double @llvm.aarch64.neon.vmulx.f64(double, double)
+declare double @llvm.aarch64.neon.fmulx.f64(double, double)
define double @test_fmulx_lane_f64(double %a, <1 x double> %v) {
- ; CHECK: test_fmulx_lane_f64
- ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+ ; CHECK-LABEL: test_fmulx_lane_f64
+ ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0]|d[0-9]+}}
%tmp1 = extractelement <1 x double> %v, i32 0
- %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1)
+ %tmp2 = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %tmp1)
ret double %tmp2;
}
define double @test_fmulx_laneq_f64_0(double %a, <2 x double> %v) {
- ; CHECK: test_fmulx_laneq_f64_0
+ ; CHECK-LABEL: test_fmulx_laneq_f64_0
; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
%tmp1 = extractelement <2 x double> %v, i32 0
- %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1)
+ %tmp2 = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %tmp1)
ret double %tmp2;
}
define double @test_fmulx_laneq_f64_1(double %a, <2 x double> %v) {
- ; CHECK: test_fmulx_laneq_f64_1
+ ; CHECK-LABEL: test_fmulx_laneq_f64_1
; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
- %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1)
+ %tmp2 = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %tmp1)
ret double %tmp2;
}
define double @test_fmulx_laneq_f64_1_swap(double %a, <2 x double> %v) {
- ; CHECK: test_fmulx_laneq_f64_1_swap
+ ; CHECK-LABEL: test_fmulx_laneq_f64_1_swap
; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
- %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %tmp1, double %a)
+ %tmp2 = call double @llvm.aarch64.neon.fmulx.f64(double %tmp1, double %a)
ret double %tmp2;
}
diff --git a/test/CodeGen/AArch64/neon-select_cc.ll b/test/CodeGen/AArch64/arm64-neon-select_cc.ll
index f6b5d3c..255b90d 100644
--- a/test/CodeGen/AArch64/neon-select_cc.ll
+++ b/test/CodeGen/AArch64/arm64-neon-select_cc.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
define <8x i8> @test_select_cc_v8i8_i8(i8 %a, i8 %b, <8x i8> %c, <8x i8> %d ) {
; CHECK-LABEL: test_select_cc_v8i8_i8:
-; CHECK: and w0, w0, #0xff
-; CHECK-NEXT: cmp w0, w1, uxtb
-; CHECK-NEXT: csinv w0, wzr, wzr, ne
-; CHECK-NEXT: dup v{{[0-9]+}}.8b, w0
-; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b
+; CHECK-DAG: fmov s[[LHS:[0-9]+]], w0
+; CHECK-DAG: fmov s[[RHS:[0-9]+]], w1
+; CHECK: cmeq [[MASK:v[0-9]+]].8b, v[[LHS]].8b, v[[RHS]].8b
+; CHECK: dup [[DUPMASK:v[0-9]+]].8b, [[MASK]].b[0]
+; CHECK: bsl [[DUPMASK]].8b, v0.8b, v1.8b
%cmp31 = icmp eq i8 %a, %b
%e = select i1 %cmp31, <8x i8> %c, <8x i8> %d
ret <8x i8> %e
@@ -14,9 +14,9 @@ define <8x i8> @test_select_cc_v8i8_i8(i8 %a, i8 %b, <8x i8> %c, <8x i8> %d ) {
define <8x i8> @test_select_cc_v8i8_f32(float %a, float %b, <8x i8> %c, <8x i8> %d ) {
; CHECK-LABEL: test_select_cc_v8i8_f32:
-; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s
-; CHECK-NEXT: dup v{{[0-9]+}}.2s, v{{[0-9]+}}.s[0]
-; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b
+; CHECK: fcmeq [[MASK:v[0-9]+]].2s, v0.2s, v1.2s
+; CHECK-NEXT: dup [[DUPMASK:v[0-9]+]].2s, [[MASK]].s[0]
+; CHECK-NEXT: bsl [[DUPMASK]].8b, v2.8b, v3.8b
%cmp31 = fcmp oeq float %a, %b
%e = select i1 %cmp31, <8x i8> %c, <8x i8> %d
ret <8x i8> %e
@@ -24,8 +24,8 @@ define <8x i8> @test_select_cc_v8i8_f32(float %a, float %b, <8x i8> %c, <8x i8>
define <8x i8> @test_select_cc_v8i8_f64(double %a, double %b, <8x i8> %c, <8x i8> %d ) {
; CHECK-LABEL: test_select_cc_v8i8_f64:
-; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d
-; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b
+; CHECK: fcmeq d[[MASK:[0-9]+]], d0, d1
+; CHECK-NEXT: bsl v[[MASK]].8b, v2.8b, v3.8b
%cmp31 = fcmp oeq double %a, %b
%e = select i1 %cmp31, <8x i8> %c, <8x i8> %d
ret <8x i8> %e
@@ -33,11 +33,11 @@ define <8x i8> @test_select_cc_v8i8_f64(double %a, double %b, <8x i8> %c, <8x i8
define <16x i8> @test_select_cc_v16i8_i8(i8 %a, i8 %b, <16x i8> %c, <16x i8> %d ) {
; CHECK-LABEL: test_select_cc_v16i8_i8:
-; CHECK: and w0, w0, #0xff
-; CHECK-NEXT: cmp w0, w1, uxtb
-; CHECK-NEXT: csinv w0, wzr, wzr, ne
-; CHECK-NEXT: dup v{{[0-9]+}}.16b, w0
-; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b
+; CHECK-DAG: fmov s[[LHS:[0-9]+]], w0
+; CHECK-DAG: fmov s[[RHS:[0-9]+]], w1
+; CHECK: cmeq [[MASK:v[0-9]+]].16b, v[[LHS]].16b, v[[RHS]].16b
+; CHECK: dup [[DUPMASK:v[0-9]+]].16b, [[MASK]].b[0]
+; CHECK: bsl [[DUPMASK]].16b, v0.16b, v1.16b
%cmp31 = icmp eq i8 %a, %b
%e = select i1 %cmp31, <16x i8> %c, <16x i8> %d
ret <16x i8> %e
@@ -45,9 +45,9 @@ define <16x i8> @test_select_cc_v16i8_i8(i8 %a, i8 %b, <16x i8> %c, <16x i8> %d
define <16x i8> @test_select_cc_v16i8_f32(float %a, float %b, <16x i8> %c, <16x i8> %d ) {
; CHECK-LABEL: test_select_cc_v16i8_f32:
-; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s
-; CHECK-NEXT: dup v{{[0-9]+}}.4s, v{{[0-9]+}}.s[0]
-; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b
+; CHECK: fcmeq [[MASK:v[0-9]+]].4s, v0.4s, v1.4s
+; CHECK-NEXT: dup [[DUPMASK:v[0-9]+]].4s, [[MASK]].s[0]
+; CHECK-NEXT: bsl [[DUPMASK]].16b, v2.16b, v3.16b
%cmp31 = fcmp oeq float %a, %b
%e = select i1 %cmp31, <16x i8> %c, <16x i8> %d
ret <16x i8> %e
@@ -55,9 +55,9 @@ define <16x i8> @test_select_cc_v16i8_f32(float %a, float %b, <16x i8> %c, <16x
define <16x i8> @test_select_cc_v16i8_f64(double %a, double %b, <16x i8> %c, <16x i8> %d ) {
; CHECK-LABEL: test_select_cc_v16i8_f64:
-; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d
-; CHECK-NEXT: dup v{{[0-9]+}}.2d, v{{[0-9]+}}.d[0]
-; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b
+; CHECK: fcmeq [[MASK:v[0-9]+]].2d, v0.2d, v1.2d
+; CHECK-NEXT: dup [[DUPMASK:v[0-9]+]].2d, [[MASK]].d[0]
+; CHECK-NEXT: bsl [[DUPMASK]].16b, v2.16b, v3.16b
%cmp31 = fcmp oeq double %a, %b
%e = select i1 %cmp31, <16x i8> %c, <16x i8> %d
ret <16x i8> %e
@@ -65,11 +65,11 @@ define <16x i8> @test_select_cc_v16i8_f64(double %a, double %b, <16x i8> %c, <16
define <4x i16> @test_select_cc_v4i16(i16 %a, i16 %b, <4x i16> %c, <4x i16> %d ) {
; CHECK-LABEL: test_select_cc_v4i16:
-; CHECK: and w0, w0, #0xffff
-; CHECK-NEXT: cmp w0, w1, uxth
-; CHECK-NEXT: csinv w0, wzr, wzr, ne
-; CHECK-NEXT: dup v{{[0-9]+}}.4h, w0
-; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b
+; CHECK-DAG: fmov s[[LHS:[0-9]+]], w0
+; CHECK-DAG: fmov s[[RHS:[0-9]+]], w1
+; CHECK: cmeq [[MASK:v[0-9]+]].4h, v[[LHS]].4h, v[[RHS]].4h
+; CHECK: dup [[DUPMASK:v[0-9]+]].4h, [[MASK]].h[0]
+; CHECK: bsl [[DUPMASK]].8b, v0.8b, v1.8b
%cmp31 = icmp eq i16 %a, %b
%e = select i1 %cmp31, <4x i16> %c, <4x i16> %d
ret <4x i16> %e
@@ -77,11 +77,11 @@ define <4x i16> @test_select_cc_v4i16(i16 %a, i16 %b, <4x i16> %c, <4x i16> %d )
define <8x i16> @test_select_cc_v8i16(i16 %a, i16 %b, <8x i16> %c, <8x i16> %d ) {
; CHECK-LABEL: test_select_cc_v8i16:
-; CHECK: and w0, w0, #0xffff
-; CHECK-NEXT: cmp w0, w1, uxth
-; CHECK-NEXT: csinv w0, wzr, wzr, ne
-; CHECK-NEXT: dup v{{[0-9]+}}.8h, w0
-; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b
+; CHECK-DAG: fmov s[[LHS:[0-9]+]], w0
+; CHECK-DAG: fmov s[[RHS:[0-9]+]], w1
+; CHECK: cmeq [[MASK:v[0-9]+]].8h, v[[LHS]].8h, v[[RHS]].8h
+; CHECK: dup [[DUPMASK:v[0-9]+]].8h, [[MASK]].h[0]
+; CHECK: bsl [[DUPMASK]].16b, v0.16b, v1.16b
%cmp31 = icmp eq i16 %a, %b
%e = select i1 %cmp31, <8x i16> %c, <8x i16> %d
ret <8x i16> %e
@@ -89,10 +89,11 @@ define <8x i16> @test_select_cc_v8i16(i16 %a, i16 %b, <8x i16> %c, <8x i16> %d )
define <2x i32> @test_select_cc_v2i32(i32 %a, i32 %b, <2x i32> %c, <2x i32> %d ) {
; CHECK-LABEL: test_select_cc_v2i32:
-; CHECK: cmp w0, w1, uxtw
-; CHECK-NEXT: csinv w0, wzr, wzr, ne
-; CHECK-NEXT: dup v{{[0-9]+}}.2s, w0
-; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b
+; CHECK-DAG: fmov s[[LHS:[0-9]+]], w0
+; CHECK-DAG: fmov s[[RHS:[0-9]+]], w1
+; CHECK: cmeq [[MASK:v[0-9]+]].2s, v[[LHS]].2s, v[[RHS]].2s
+; CHECK: dup [[DUPMASK:v[0-9]+]].2s, [[MASK]].s[0]
+; CHECK: bsl [[DUPMASK]].8b, v0.8b, v1.8b
%cmp31 = icmp eq i32 %a, %b
%e = select i1 %cmp31, <2x i32> %c, <2x i32> %d
ret <2x i32> %e
@@ -100,10 +101,11 @@ define <2x i32> @test_select_cc_v2i32(i32 %a, i32 %b, <2x i32> %c, <2x i32> %d )
define <4x i32> @test_select_cc_v4i32(i32 %a, i32 %b, <4x i32> %c, <4x i32> %d ) {
; CHECK-LABEL: test_select_cc_v4i32:
-; CHECK: cmp w0, w1, uxtw
-; CHECK-NEXT: csinv w0, wzr, wzr, ne
-; CHECK-NEXT: dup v{{[0-9]+}}.4s, w0
-; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b
+; CHECK-DAG: fmov s[[LHS:[0-9]+]], w0
+; CHECK-DAG: fmov s[[RHS:[0-9]+]], w1
+; CHECK: cmeq [[MASK:v[0-9]+]].4s, v[[LHS]].4s, v[[RHS]].4s
+; CHECK: dup [[DUPMASK:v[0-9]+]].4s, [[MASK]].s[0]
+; CHECK: bsl [[DUPMASK]].16b, v0.16b, v1.16b
%cmp31 = icmp eq i32 %a, %b
%e = select i1 %cmp31, <4x i32> %c, <4x i32> %d
ret <4x i32> %e
@@ -111,10 +113,10 @@ define <4x i32> @test_select_cc_v4i32(i32 %a, i32 %b, <4x i32> %c, <4x i32> %d )
define <1x i64> @test_select_cc_v1i64(i64 %a, i64 %b, <1x i64> %c, <1x i64> %d ) {
; CHECK-LABEL: test_select_cc_v1i64:
-; CHECK: cmp x0, x1
-; CHECK-NEXT: csinv x0, xzr, xzr, ne
-; CHECK-NEXT: fmov d{{[0-9]+}}, x0
-; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b
+; CHECK-DAG: fmov d[[LHS:[0-9]+]], x0
+; CHECK-DAG: fmov d[[RHS:[0-9]+]], x1
+; CHECK: cmeq d[[MASK:[0-9]+]], d[[LHS]], d[[RHS]]
+; CHECK: bsl v[[MASK]].8b, v0.8b, v1.8b
%cmp31 = icmp eq i64 %a, %b
%e = select i1 %cmp31, <1x i64> %c, <1x i64> %d
ret <1x i64> %e
@@ -122,10 +124,11 @@ define <1x i64> @test_select_cc_v1i64(i64 %a, i64 %b, <1x i64> %c, <1x i64> %d )
define <2x i64> @test_select_cc_v2i64(i64 %a, i64 %b, <2x i64> %c, <2x i64> %d ) {
; CHECK-LABEL: test_select_cc_v2i64:
-; CHECK: cmp x0, x1
-; CHECK-NEXT: csinv x0, xzr, xzr, ne
-; CHECK-NEXT: dup v{{[0-9]+}}.2d, x0
-; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b
+; CHECK-DAG: fmov d[[LHS:[0-9]+]], x0
+; CHECK-DAG: fmov d[[RHS:[0-9]+]], x1
+; CHECK: cmeq [[MASK:v[0-9]+]].2d, v[[LHS]].2d, v[[RHS]].2d
+; CHECK: dup [[DUPMASK:v[0-9]+]].2d, [[MASK]].d[0]
+; CHECK: bsl [[DUPMASK]].16b, v0.16b, v1.16b
%cmp31 = icmp eq i64 %a, %b
%e = select i1 %cmp31, <2x i64> %c, <2x i64> %d
ret <2x i64> %e
@@ -133,18 +136,18 @@ define <2x i64> @test_select_cc_v2i64(i64 %a, i64 %b, <2x i64> %c, <2x i64> %d )
define <1 x float> @test_select_cc_v1f32(float %a, float %b, <1 x float> %c, <1 x float> %d ) {
; CHECK-LABEL: test_select_cc_v1f32:
-; CHECK: fcmp s0, s1
-; CHECK-NEXT: fcsel s0, s2, s3, eq
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: fcsel s0, s2, s3, eq
%cmp31 = fcmp oeq float %a, %b
%e = select i1 %cmp31, <1 x float> %c, <1 x float> %d
ret <1 x float> %e
}
-
+
define <2 x float> @test_select_cc_v2f32(float %a, float %b, <2 x float> %c, <2 x float> %d ) {
; CHECK-LABEL: test_select_cc_v2f32:
-; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s
-; CHECK-NEXT: dup v{{[0-9]+}}.2s, v{{[0-9]+}}.s[0]
-; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b
+; CHECK: fcmeq [[MASK:v[0-9]+]].2s, v0.2s, v1.2s
+; CHECK: dup [[DUPMASK:v[0-9]+]].2s, [[MASK]].s[0]
+; CHECK: bsl [[DUPMASK]].8b, v2.8b, v3.8b
%cmp31 = fcmp oeq float %a, %b
%e = select i1 %cmp31, <2 x float> %c, <2 x float> %d
ret <2 x float> %e
@@ -152,9 +155,9 @@ define <2 x float> @test_select_cc_v2f32(float %a, float %b, <2 x float> %c, <2
define <4x float> @test_select_cc_v4f32(float %a, float %b, <4x float> %c, <4x float> %d ) {
; CHECK-LABEL: test_select_cc_v4f32:
-; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s
-; CHECK-NEXT: dup v{{[0-9]+}}.4s, v{{[0-9]+}}.s[0]
-; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b
+; CHECK: fcmeq [[MASK:v[0-9]+]].4s, v0.4s, v1.4s
+; CHECK: dup [[DUPMASK:v[0-9]+]].4s, [[MASK]].s[0]
+; CHECK: bsl [[DUPMASK]].16b, v2.16b, v3.16b
%cmp31 = fcmp oeq float %a, %b
%e = select i1 %cmp31, <4x float> %c, <4x float> %d
ret <4x float> %e
@@ -162,10 +165,11 @@ define <4x float> @test_select_cc_v4f32(float %a, float %b, <4x float> %c, <4x f
define <4x float> @test_select_cc_v4f32_icmp(i32 %a, i32 %b, <4x float> %c, <4x float> %d ) {
; CHECK-LABEL: test_select_cc_v4f32_icmp:
-; CHECK: cmp w0, w1, uxtw
-; CHECK: csinv w0, wzr, wzr, ne
-; CHECK-NEXT: dup v{{[0-9]+}}.4s, w0
-; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b
+; CHECK-DAG: fmov s[[LHS:[0-9]+]], w0
+; CHECK-DAG: fmov s[[RHS:[0-9]+]], w1
+; CHECK: cmeq [[MASK:v[0-9]+]].4s, v[[LHS]].4s, v[[RHS]].4s
+; CHECK: dup [[DUPMASK:v[0-9]+]].4s, [[MASK]].s[0]
+; CHECK: bsl [[DUPMASK]].16b, v0.16b, v1.16b
%cmp31 = icmp eq i32 %a, %b
%e = select i1 %cmp31, <4x float> %c, <4x float> %d
ret <4x float> %e
@@ -173,8 +177,8 @@ define <4x float> @test_select_cc_v4f32_icmp(i32 %a, i32 %b, <4x float> %c, <4x
define <1 x double> @test_select_cc_v1f64(double %a, double %b, <1 x double> %c, <1 x double> %d ) {
; CHECK-LABEL: test_select_cc_v1f64:
-; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d
-; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b
+; CHECK: fcmeq d[[MASK:[0-9]+]], d0, d1
+; CHECK: bsl v[[MASK]].8b, v2.8b, v3.8b
%cmp31 = fcmp oeq double %a, %b
%e = select i1 %cmp31, <1 x double> %c, <1 x double> %d
ret <1 x double> %e
@@ -182,10 +186,10 @@ define <1 x double> @test_select_cc_v1f64(double %a, double %b, <1 x double> %c,
define <1 x double> @test_select_cc_v1f64_icmp(i64 %a, i64 %b, <1 x double> %c, <1 x double> %d ) {
; CHECK-LABEL: test_select_cc_v1f64_icmp:
-; CHECK: cmp x0, x1
-; CHECK-NEXT: csinv x0, xzr, xzr, ne
-; CHECK-NEXT: fmov d{{[0-9]+}}, x0
-; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b
+; CHECK-DAG: fmov [[LHS:d[0-9]+]], x0
+; CHECK-DAG: fmov [[RHS:d[0-9]+]], x1
+; CHECK: cmeq d[[MASK:[0-9]+]], [[LHS]], [[RHS]]
+; CHECK: bsl v[[MASK]].8b, v0.8b, v1.8b
%cmp31 = icmp eq i64 %a, %b
%e = select i1 %cmp31, <1 x double> %c, <1 x double> %d
ret <1 x double> %e
@@ -193,9 +197,9 @@ define <1 x double> @test_select_cc_v1f64_icmp(i64 %a, i64 %b, <1 x double> %c,
define <2 x double> @test_select_cc_v2f64(double %a, double %b, <2 x double> %c, <2 x double> %d ) {
; CHECK-LABEL: test_select_cc_v2f64:
-; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d
-; CHECK-NEXT: dup v{{[0-9]+}}.2d, v{{[0-9]+}}.d[0]
-; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b
+; CHECK: fcmeq [[MASK:v[0-9]+]].2d, v0.2d, v1.2d
+; CHECK: dup [[DUPMASK:v[0-9]+]].2d, [[MASK]].d[0]
+; CHECK: bsl [[DUPMASK]].16b, v2.16b, v3.16b
%cmp31 = fcmp oeq double %a, %b
%e = select i1 %cmp31, <2 x double> %c, <2 x double> %d
ret <2 x double> %e
diff --git a/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll b/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
new file mode 100644
index 0000000..cca6bfe
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
@@ -0,0 +1,482 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+
+%struct.uint8x16x2_t = type { [2 x <16 x i8>] }
+%struct.poly8x16x2_t = type { [2 x <16 x i8>] }
+%struct.uint8x16x3_t = type { [3 x <16 x i8>] }
+%struct.int8x16x2_t = type { [2 x <16 x i8>] }
+%struct.int16x8x2_t = type { [2 x <8 x i16>] }
+%struct.int32x4x2_t = type { [2 x <4 x i32>] }
+%struct.int64x2x2_t = type { [2 x <2 x i64>] }
+%struct.float32x4x2_t = type { [2 x <4 x float>] }
+%struct.float64x2x2_t = type { [2 x <2 x double>] }
+%struct.int8x8x2_t = type { [2 x <8 x i8>] }
+%struct.int16x4x2_t = type { [2 x <4 x i16>] }
+%struct.int32x2x2_t = type { [2 x <2 x i32>] }
+%struct.int64x1x2_t = type { [2 x <1 x i64>] }
+%struct.float32x2x2_t = type { [2 x <2 x float>] }
+%struct.float64x1x2_t = type { [2 x <1 x double>] }
+%struct.int8x16x3_t = type { [3 x <16 x i8>] }
+%struct.int16x8x3_t = type { [3 x <8 x i16>] }
+%struct.int32x4x3_t = type { [3 x <4 x i32>] }
+%struct.int64x2x3_t = type { [3 x <2 x i64>] }
+%struct.float32x4x3_t = type { [3 x <4 x float>] }
+%struct.float64x2x3_t = type { [3 x <2 x double>] }
+%struct.int8x8x3_t = type { [3 x <8 x i8>] }
+%struct.int16x4x3_t = type { [3 x <4 x i16>] }
+%struct.int32x2x3_t = type { [3 x <2 x i32>] }
+%struct.int64x1x3_t = type { [3 x <1 x i64>] }
+%struct.float32x2x3_t = type { [3 x <2 x float>] }
+%struct.float64x1x3_t = type { [3 x <1 x double>] }
+%struct.int8x16x4_t = type { [4 x <16 x i8>] }
+%struct.int16x8x4_t = type { [4 x <8 x i16>] }
+%struct.int32x4x4_t = type { [4 x <4 x i32>] }
+%struct.int64x2x4_t = type { [4 x <2 x i64>] }
+%struct.float32x4x4_t = type { [4 x <4 x float>] }
+%struct.float64x2x4_t = type { [4 x <2 x double>] }
+%struct.int8x8x4_t = type { [4 x <8 x i8>] }
+%struct.int16x4x4_t = type { [4 x <4 x i16>] }
+%struct.int32x2x4_t = type { [4 x <2 x i32>] }
+%struct.int64x1x4_t = type { [4 x <1 x i64>] }
+%struct.float32x2x4_t = type { [4 x <2 x float>] }
+%struct.float64x1x4_t = type { [4 x <1 x double>] }
+
+define <16 x i8> @test_ld_from_poll_v16i8(<16 x i8> %a) {
+; CHECK-LABEL: test_ld_from_poll_v16i8:
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = add <16 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 2, i8 13, i8 14, i8 15, i8 16>
+ ret <16 x i8> %b
+}
+
+define <8 x i16> @test_ld_from_poll_v8i16(<8 x i16> %a) {
+; CHECK-LABEL: test_ld_from_poll_v8i16:
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = add <8 x i16> %a, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
+ ret <8 x i16> %b
+}
+
+define <4 x i32> @test_ld_from_poll_v4i32(<4 x i32> %a) {
+; CHECK-LABEL: test_ld_from_poll_v4i32:
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = add <4 x i32> %a, <i32 1, i32 2, i32 3, i32 4>
+ ret <4 x i32> %b
+}
+
+define <2 x i64> @test_ld_from_poll_v2i64(<2 x i64> %a) {
+; CHECK-LABEL: test_ld_from_poll_v2i64:
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = add <2 x i64> %a, <i64 1, i64 2>
+ ret <2 x i64> %b
+}
+
+define <4 x float> @test_ld_from_poll_v4f32(<4 x float> %a) {
+; CHECK-LABEL: test_ld_from_poll_v4f32:
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
+ ret <4 x float> %b
+}
+
+define <2 x double> @test_ld_from_poll_v2f64(<2 x double> %a) {
+; CHECK-LABEL: test_ld_from_poll_v2f64:
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = fadd <2 x double> %a, <double 1.0, double 2.0>
+ ret <2 x double> %b
+}
+
+define <8 x i8> @test_ld_from_poll_v8i8(<8 x i8> %a) {
+; CHECK-LABEL: test_ld_from_poll_v8i8:
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = add <8 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>
+ ret <8 x i8> %b
+}
+
+define <4 x i16> @test_ld_from_poll_v4i16(<4 x i16> %a) {
+; CHECK-LABEL: test_ld_from_poll_v4i16:
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = add <4 x i16> %a, <i16 1, i16 2, i16 3, i16 4>
+ ret <4 x i16> %b
+}
+
+define <2 x i32> @test_ld_from_poll_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: test_ld_from_poll_v2i32:
+; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
+; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.{{[A-Z0-9_]+}}]
+entry:
+ %b = add <2 x i32> %a, <i32 1, i32 2>
+ ret <2 x i32> %b
+}
+
+define <16 x i8> @test_vld1q_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld1q_dup_s8:
+; CHECK: ld1r {{{ ?v[0-9]+.16b ?}}}, [x0]
+entry:
+ %0 = load i8* %a, align 1
+ %1 = insertelement <16 x i8> undef, i8 %0, i32 0
+ %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+ ret <16 x i8> %lane
+}
+
+define <8 x i16> @test_vld1q_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld1q_dup_s16:
+; CHECK: ld1r {{{ ?v[0-9]+.8h ?}}}, [x0]
+entry:
+ %0 = load i16* %a, align 2
+ %1 = insertelement <8 x i16> undef, i16 %0, i32 0
+ %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %lane
+}
+
+define <4 x i32> @test_vld1q_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld1q_dup_s32:
+; CHECK: ld1r {{{ ?v[0-9]+.4s ?}}}, [x0]
+entry:
+ %0 = load i32* %a, align 4
+ %1 = insertelement <4 x i32> undef, i32 %0, i32 0
+ %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %lane
+}
+
+define <2 x i64> @test_vld1q_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld1q_dup_s64:
+; CHECK: ld1r {{{ ?v[0-9]+.2d ?}}}, [x0]
+entry:
+ %0 = load i64* %a, align 8
+ %1 = insertelement <2 x i64> undef, i64 %0, i32 0
+ %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
+ ret <2 x i64> %lane
+}
+
+define <4 x float> @test_vld1q_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld1q_dup_f32:
+; CHECK: ld1r {{{ ?v[0-9]+.4s ?}}}, [x0]
+entry:
+ %0 = load float* %a, align 4
+ %1 = insertelement <4 x float> undef, float %0, i32 0
+ %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %lane
+}
+
+define <2 x double> @test_vld1q_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld1q_dup_f64:
+; CHECK: ld1r {{{ ?v[0-9]+.2d ?}}}, [x0]
+entry:
+ %0 = load double* %a, align 8
+ %1 = insertelement <2 x double> undef, double %0, i32 0
+ %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+ ret <2 x double> %lane
+}
+
+define <8 x i8> @test_vld1_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld1_dup_s8:
+; CHECK: ld1r {{{ ?v[0-9]+.8b ?}}}, [x0]
+entry:
+ %0 = load i8* %a, align 1
+ %1 = insertelement <8 x i8> undef, i8 %0, i32 0
+ %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+ ret <8 x i8> %lane
+}
+
+define <4 x i16> @test_vld1_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld1_dup_s16:
+; CHECK: ld1r {{{ ?v[0-9]+.4h ?}}}, [x0]
+entry:
+ %0 = load i16* %a, align 2
+ %1 = insertelement <4 x i16> undef, i16 %0, i32 0
+ %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
+ ret <4 x i16> %lane
+}
+
+define <2 x i32> @test_vld1_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld1_dup_s32:
+; CHECK: ld1r {{{ ?v[0-9]+.2s ?}}}, [x0]
+entry:
+ %0 = load i32* %a, align 4
+ %1 = insertelement <2 x i32> undef, i32 %0, i32 0
+ %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
+ ret <2 x i32> %lane
+}
+
+define <1 x i64> @test_vld1_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld1_dup_s64:
+; CHECK: ldr {{d[0-9]+}}, [x0]
+entry:
+ %0 = load i64* %a, align 8
+ %1 = insertelement <1 x i64> undef, i64 %0, i32 0
+ ret <1 x i64> %1
+}
+
+define <2 x float> @test_vld1_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld1_dup_f32:
+; CHECK: ld1r {{{ ?v[0-9]+.2s ?}}}, [x0]
+entry:
+ %0 = load float* %a, align 4
+ %1 = insertelement <2 x float> undef, float %0, i32 0
+ %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
+ ret <2 x float> %lane
+}
+
+define <1 x double> @test_vld1_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld1_dup_f64:
+; CHECK: ldr {{d[0-9]+}}, [x0]
+entry:
+ %0 = load double* %a, align 8
+ %1 = insertelement <1 x double> undef, double %0, i32 0
+ ret <1 x double> %1
+}
+
+define <1 x i64> @testDUP.v1i64(i64* %a, i64* %b) #0 {
+; As there is a store operation depending on %1, the LD1R pattern can't be selected,
+; so LDR and FMOV should be emitted instead.
+; CHECK-LABEL: testDUP.v1i64:
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}]
+; CHECK-DAG: fmov {{d[0-9]+}}, {{x[0-9]+}}
+; CHECK-DAG: str {{x[0-9]+}}, [{{x[0-9]+}}]
+ %1 = load i64* %a, align 8
+ store i64 %1, i64* %b, align 8
+ %vecinit.i = insertelement <1 x i64> undef, i64 %1, i32 0
+ ret <1 x i64> %vecinit.i
+}
+
+define <1 x double> @testDUP.v1f64(double* %a, double* %b) #0 {
+; As there is a store operation depending on %1, the LD1R pattern can't be selected,
+; so LDR and FMOV should be emitted instead.
+; CHECK-LABEL: testDUP.v1f64:
+; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}]
+; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}]
+ %1 = load double* %a, align 8
+ store double %1, double* %b, align 8
+ %vecinit.i = insertelement <1 x double> undef, double %1, i32 0
+ ret <1 x double> %vecinit.i
+}
+
+define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vld1q_lane_s8:
+; CHECK: ld1 { {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i8* %a, align 1
+ %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15
+ ret <16 x i8> %vld1_lane
+}
+
+define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vld1q_lane_s16:
+; CHECK: ld1 { {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i16* %a, align 2
+ %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7
+ ret <8 x i16> %vld1_lane
+}
+
+define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vld1q_lane_s32:
+; CHECK: ld1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i32* %a, align 4
+ %vld1_lane = insertelement <4 x i32> %b, i32 %0, i32 3
+ ret <4 x i32> %vld1_lane
+}
+
+define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vld1q_lane_s64:
+; CHECK: ld1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i64* %a, align 8
+ %vld1_lane = insertelement <2 x i64> %b, i64 %0, i32 1
+ ret <2 x i64> %vld1_lane
+}
+
+define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) {
+; CHECK-LABEL: test_vld1q_lane_f32:
+; CHECK: ld1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
+entry:
+ %0 = load float* %a, align 4
+ %vld1_lane = insertelement <4 x float> %b, float %0, i32 3
+ ret <4 x float> %vld1_lane
+}
+
+define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) {
+; CHECK-LABEL: test_vld1q_lane_f64:
+; CHECK: ld1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
+entry:
+ %0 = load double* %a, align 8
+ %vld1_lane = insertelement <2 x double> %b, double %0, i32 1
+ ret <2 x double> %vld1_lane
+}
+
+define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) {
+; CHECK-LABEL: test_vld1_lane_s8:
+; CHECK: ld1 { {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i8* %a, align 1
+ %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7
+ ret <8 x i8> %vld1_lane
+}
+
+define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) {
+; CHECK-LABEL: test_vld1_lane_s16:
+; CHECK: ld1 { {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i16* %a, align 2
+ %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3
+ ret <4 x i16> %vld1_lane
+}
+
+define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) {
+; CHECK-LABEL: test_vld1_lane_s32:
+; CHECK: ld1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i32* %a, align 4
+ %vld1_lane = insertelement <2 x i32> %b, i32 %0, i32 1
+ ret <2 x i32> %vld1_lane
+}
+
+define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) {
+; CHECK-LABEL: test_vld1_lane_s64:
+; CHECK: ldr {{d[0-9]+}}, [x0]
+entry:
+ %0 = load i64* %a, align 8
+ %vld1_lane = insertelement <1 x i64> undef, i64 %0, i32 0
+ ret <1 x i64> %vld1_lane
+}
+
+define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) {
+; CHECK-LABEL: test_vld1_lane_f32:
+; CHECK: ld1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
+entry:
+ %0 = load float* %a, align 4
+ %vld1_lane = insertelement <2 x float> %b, float %0, i32 1
+ ret <2 x float> %vld1_lane
+}
+
+define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) {
+; CHECK-LABEL: test_vld1_lane_f64:
+; CHECK: ldr {{d[0-9]+}}, [x0]
+entry:
+ %0 = load double* %a, align 8
+ %vld1_lane = insertelement <1 x double> undef, double %0, i32 0
+ ret <1 x double> %vld1_lane
+}
+
+define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vst1q_lane_s8:
+; CHECK: st1 { {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <16 x i8> %b, i32 15
+ store i8 %0, i8* %a, align 1
+ ret void
+}
+
+define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vst1q_lane_s16:
+; CHECK: st1 { {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <8 x i16> %b, i32 7
+ store i16 %0, i16* %a, align 2
+ ret void
+}
+
+define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vst1q_lane_s32:
+; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <4 x i32> %b, i32 3
+ store i32 %0, i32* %a, align 4
+ ret void
+}
+
+define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vst1q_lane_s64:
+; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <2 x i64> %b, i32 1
+ store i64 %0, i64* %a, align 8
+ ret void
+}
+
+define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) {
+; CHECK-LABEL: test_vst1q_lane_f32:
+; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <4 x float> %b, i32 3
+ store float %0, float* %a, align 4
+ ret void
+}
+
+define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) {
+; CHECK-LABEL: test_vst1q_lane_f64:
+; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <2 x double> %b, i32 1
+ store double %0, double* %a, align 8
+ ret void
+}
+
+define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) {
+; CHECK-LABEL: test_vst1_lane_s8:
+; CHECK: st1 { {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <8 x i8> %b, i32 7
+ store i8 %0, i8* %a, align 1
+ ret void
+}
+
+define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) {
+; CHECK-LABEL: test_vst1_lane_s16:
+; CHECK: st1 { {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <4 x i16> %b, i32 3
+ store i16 %0, i16* %a, align 2
+ ret void
+}
+
+define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) {
+; CHECK-LABEL: test_vst1_lane_s32:
+; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <2 x i32> %b, i32 1
+ store i32 %0, i32* %a, align 4
+ ret void
+}
+
+define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) {
+; CHECK-LABEL: test_vst1_lane_s64:
+; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <1 x i64> %b, i32 0
+ store i64 %0, i64* %a, align 8
+ ret void
+}
+
+define void @test_vst1_lane_f32(float* %a, <2 x float> %b) {
+; CHECK-LABEL: test_vst1_lane_f32:
+; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <2 x float> %b, i32 1
+ store float %0, float* %a, align 4
+ ret void
+}
+
+define void @test_vst1_lane_f64(double* %a, <1 x double> %b) {
+; CHECK-LABEL: test_vst1_lane_f64:
+; CHECK: str {{d[0-9]+}}, [x0]
+entry:
+ %0 = extractelement <1 x double> %b, i32 0
+ store double %0, double* %a, align 8
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-neon-simd-shift.ll b/test/CodeGen/AArch64/arm64-neon-simd-shift.ll
new file mode 100644
index 0000000..447fb63
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-neon-simd-shift.ll
@@ -0,0 +1,663 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) {
+; CHECK: test_vshr_n_s8
+; CHECK: sshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %vshr_n = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ret <8 x i8> %vshr_n
+}
+
+define <4 x i16> @test_vshr_n_s16(<4 x i16> %a) {
+; CHECK: test_vshr_n_s16
+; CHECK: sshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %vshr_n = ashr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
+ ret <4 x i16> %vshr_n
+}
+
+define <2 x i32> @test_vshr_n_s32(<2 x i32> %a) {
+; CHECK: test_vshr_n_s32
+; CHECK: sshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %vshr_n = ashr <2 x i32> %a, <i32 3, i32 3>
+ ret <2 x i32> %vshr_n
+}
+
+define <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) {
+; CHECK: test_vshrq_n_s8
+; CHECK: sshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %vshr_n = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ret <16 x i8> %vshr_n
+}
+
+define <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) {
+; CHECK: test_vshrq_n_s16
+; CHECK: sshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %vshr_n = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ ret <8 x i16> %vshr_n
+}
+
+define <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) {
+; CHECK: test_vshrq_n_s32
+; CHECK: sshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %vshr_n = ashr <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
+ ret <4 x i32> %vshr_n
+}
+
+define <2 x i64> @test_vshrq_n_s64(<2 x i64> %a) {
+; CHECK: test_vshrq_n_s64
+; CHECK: sshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+ %vshr_n = ashr <2 x i64> %a, <i64 3, i64 3>
+ ret <2 x i64> %vshr_n
+}
+
+define <8 x i8> @test_vshr_n_u8(<8 x i8> %a) {
+; CHECK: test_vshr_n_u8
+; CHECK: ushr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %vshr_n = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ret <8 x i8> %vshr_n
+}
+
+define <4 x i16> @test_vshr_n_u16(<4 x i16> %a) {
+; CHECK: test_vshr_n_u16
+; CHECK: ushr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %vshr_n = lshr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
+ ret <4 x i16> %vshr_n
+}
+
+define <2 x i32> @test_vshr_n_u32(<2 x i32> %a) {
+; CHECK: test_vshr_n_u32
+; CHECK: ushr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %vshr_n = lshr <2 x i32> %a, <i32 3, i32 3>
+ ret <2 x i32> %vshr_n
+}
+
+define <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) {
+; CHECK: test_vshrq_n_u8
+; CHECK: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %vshr_n = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ret <16 x i8> %vshr_n
+}
+
+define <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) {
+; CHECK: test_vshrq_n_u16
+; CHECK: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %vshr_n = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ ret <8 x i16> %vshr_n
+}
+
+define <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) {
+; CHECK: test_vshrq_n_u32
+; CHECK: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %vshr_n = lshr <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
+ ret <4 x i32> %vshr_n
+}
+
+define <2 x i64> @test_vshrq_n_u64(<2 x i64> %a) {
+; CHECK: test_vshrq_n_u64
+; CHECK: ushr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+ %vshr_n = lshr <2 x i64> %a, <i64 3, i64 3>
+ ret <2 x i64> %vshr_n
+}
+
+define <8 x i8> @test_vsra_n_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vsra_n_s8
+; CHECK: ssra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %vsra_n = ashr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ %1 = add <8 x i8> %vsra_n, %a
+ ret <8 x i8> %1
+}
+
+define <4 x i16> @test_vsra_n_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vsra_n_s16
+; CHECK: ssra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %vsra_n = ashr <4 x i16> %b, <i16 3, i16 3, i16 3, i16 3>
+ %1 = add <4 x i16> %vsra_n, %a
+ ret <4 x i16> %1
+}
+
+define <2 x i32> @test_vsra_n_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vsra_n_s32
+; CHECK: ssra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %vsra_n = ashr <2 x i32> %b, <i32 3, i32 3>
+ %1 = add <2 x i32> %vsra_n, %a
+ ret <2 x i32> %1
+}
+
+define <16 x i8> @test_vsraq_n_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vsraq_n_s8
+; CHECK: ssra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %vsra_n = ashr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ %1 = add <16 x i8> %vsra_n, %a
+ ret <16 x i8> %1
+}
+
+define <8 x i16> @test_vsraq_n_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsraq_n_s16
+; CHECK: ssra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %vsra_n = ashr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %1 = add <8 x i16> %vsra_n, %a
+ ret <8 x i16> %1
+}
+
+define <4 x i32> @test_vsraq_n_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsraq_n_s32
+; CHECK: ssra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %vsra_n = ashr <4 x i32> %b, <i32 3, i32 3, i32 3, i32 3>
+ %1 = add <4 x i32> %vsra_n, %a
+ ret <4 x i32> %1
+}
+
+define <2 x i64> @test_vsraq_n_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vsraq_n_s64
+; CHECK: ssra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+ %vsra_n = ashr <2 x i64> %b, <i64 3, i64 3>
+ %1 = add <2 x i64> %vsra_n, %a
+ ret <2 x i64> %1
+}
+
+define <8 x i8> @test_vsra_n_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vsra_n_u8
+; CHECK: usra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %vsra_n = lshr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ %1 = add <8 x i8> %vsra_n, %a
+ ret <8 x i8> %1
+}
+
+define <4 x i16> @test_vsra_n_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vsra_n_u16
+; CHECK: usra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %vsra_n = lshr <4 x i16> %b, <i16 3, i16 3, i16 3, i16 3>
+ %1 = add <4 x i16> %vsra_n, %a
+ ret <4 x i16> %1
+}
+
+define <2 x i32> @test_vsra_n_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vsra_n_u32
+; CHECK: usra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %vsra_n = lshr <2 x i32> %b, <i32 3, i32 3>
+ %1 = add <2 x i32> %vsra_n, %a
+ ret <2 x i32> %1
+}
+
+define <16 x i8> @test_vsraq_n_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vsraq_n_u8
+; CHECK: usra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %vsra_n = lshr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ %1 = add <16 x i8> %vsra_n, %a
+ ret <16 x i8> %1
+}
+
+define <8 x i16> @test_vsraq_n_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsraq_n_u16
+; CHECK: usra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %vsra_n = lshr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %1 = add <8 x i16> %vsra_n, %a
+ ret <8 x i16> %1
+}
+
+define <4 x i32> @test_vsraq_n_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsraq_n_u32
+; CHECK: usra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %vsra_n = lshr <4 x i32> %b, <i32 3, i32 3, i32 3, i32 3>
+ %1 = add <4 x i32> %vsra_n, %a
+ ret <4 x i32> %1
+}
+
+define <2 x i64> @test_vsraq_n_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vsraq_n_u64
+; CHECK: usra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+ %vsra_n = lshr <2 x i64> %b, <i64 3, i64 3>
+ %1 = add <2 x i64> %vsra_n, %a
+ ret <2 x i64> %1
+}
+
+define <8 x i8> @test_vshrn_n_s16(<8 x i16> %a) {
+; CHECK: test_vshrn_n_s16
+; CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+ %1 = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
+ ret <8 x i8> %vshrn_n
+}
+
+define <4 x i16> @test_vshrn_n_s32(<4 x i32> %a) {
+; CHECK: test_vshrn_n_s32
+; CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+ %1 = ashr <4 x i32> %a, <i32 9, i32 9, i32 9, i32 9>
+ %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
+ ret <4 x i16> %vshrn_n
+}
+
+define <2 x i32> @test_vshrn_n_s64(<2 x i64> %a) {
+; CHECK: test_vshrn_n_s64
+; CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+ %1 = ashr <2 x i64> %a, <i64 19, i64 19>
+ %vshrn_n = trunc <2 x i64> %1 to <2 x i32>
+ ret <2 x i32> %vshrn_n
+}
+
+define <8 x i8> @test_vshrn_n_u16(<8 x i16> %a) {
+; CHECK: test_vshrn_n_u16
+; CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+ %1 = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
+ ret <8 x i8> %vshrn_n
+}
+
+define <4 x i16> @test_vshrn_n_u32(<4 x i32> %a) {
+; CHECK: test_vshrn_n_u32
+; CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+ %1 = lshr <4 x i32> %a, <i32 9, i32 9, i32 9, i32 9>
+ %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
+ ret <4 x i16> %vshrn_n
+}
+
+define <2 x i32> @test_vshrn_n_u64(<2 x i64> %a) {
+; CHECK: test_vshrn_n_u64
+; CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+ %1 = lshr <2 x i64> %a, <i64 19, i64 19>
+ %vshrn_n = trunc <2 x i64> %1 to <2 x i32>
+ ret <2 x i32> %vshrn_n
+}
+
+define <16 x i8> @test_vshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vshrn_high_n_s16
+; CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %1 = ashr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
+ %2 = bitcast <8 x i8> %a to <1 x i64>
+ %3 = bitcast <8 x i8> %vshrn_n to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+ %4 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %4
+}
+
+define <8 x i16> @test_vshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vshrn_high_n_s32
+; CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %1 = ashr <4 x i32> %b, <i32 9, i32 9, i32 9, i32 9>
+ %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
+ %2 = bitcast <4 x i16> %a to <1 x i64>
+ %3 = bitcast <4 x i16> %vshrn_n to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+ %4 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %4
+}
+
+define <4 x i32> @test_vshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vshrn_high_n_s64
+; CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %2 = ashr <2 x i64> %b, <i64 19, i64 19>
+ %vshrn_n = trunc <2 x i64> %2 to <2 x i32>
+ %3 = bitcast <2 x i32> %vshrn_n to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+ %4 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %4
+}
+
+define <16 x i8> @test_vshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vshrn_high_n_u16
+; CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %1 = lshr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
+ %2 = bitcast <8 x i8> %a to <1 x i64>
+ %3 = bitcast <8 x i8> %vshrn_n to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+ %4 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %4
+}
+
+define <8 x i16> @test_vshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vshrn_high_n_u32
+; CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %1 = lshr <4 x i32> %b, <i32 9, i32 9, i32 9, i32 9>
+ %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
+ %2 = bitcast <4 x i16> %a to <1 x i64>
+ %3 = bitcast <4 x i16> %vshrn_n to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+ %4 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %4
+}
+
+define <4 x i32> @test_vshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vshrn_high_n_u64
+; CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %2 = lshr <2 x i64> %b, <i64 19, i64 19>
+ %vshrn_n = trunc <2 x i64> %2 to <2 x i32>
+ %3 = bitcast <2 x i32> %vshrn_n to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+ %4 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %4
+}
+
+define <16 x i8> @test_vqshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqshrun_high_n_s16
+; CHECK: sqshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %vqshrun = tail call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> %b, i32 3)
+ %1 = bitcast <8 x i8> %a to <1 x i64>
+ %2 = bitcast <8 x i8> %vqshrun to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqshrun_high_n_s32
+; CHECK: sqshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %vqshrun = tail call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> %b, i32 9)
+ %1 = bitcast <4 x i16> %a to <1 x i64>
+ %2 = bitcast <4 x i16> %vqshrun to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqshrun_high_n_s64
+; CHECK: sqshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %vqshrun = tail call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> %b, i32 19)
+ %2 = bitcast <2 x i32> %vqshrun to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+define <16 x i8> @test_vrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vrshrn_high_n_s16
+; CHECK: rshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %vrshrn = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %b, i32 3)
+ %1 = bitcast <8 x i8> %a to <1 x i64>
+ %2 = bitcast <8 x i8> %vrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vrshrn_high_n_s32
+; CHECK: rshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %vrshrn = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %b, i32 9)
+ %1 = bitcast <4 x i16> %a to <1 x i64>
+ %2 = bitcast <4 x i16> %vrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vrshrn_high_n_s64
+; CHECK: rshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %vrshrn = tail call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %b, i32 19)
+ %2 = bitcast <2 x i32> %vrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+define <16 x i8> @test_vqrshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqrshrun_high_n_s16
+; CHECK: sqrshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %vqrshrun = tail call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> %b, i32 3)
+ %1 = bitcast <8 x i8> %a to <1 x i64>
+ %2 = bitcast <8 x i8> %vqrshrun to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqrshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqrshrun_high_n_s32
+; CHECK: sqrshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %vqrshrun = tail call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> %b, i32 9)
+ %1 = bitcast <4 x i16> %a to <1 x i64>
+ %2 = bitcast <4 x i16> %vqrshrun to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqrshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqrshrun_high_n_s64
+; CHECK: sqrshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %vqrshrun = tail call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> %b, i32 19)
+ %2 = bitcast <2 x i32> %vqrshrun to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+define <16 x i8> @test_vqshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqshrn_high_n_s16
+; CHECK: sqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> %b, i32 3)
+ %1 = bitcast <8 x i8> %a to <1 x i64>
+ %2 = bitcast <8 x i8> %vqshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqshrn_high_n_s32
+; CHECK: sqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> %b, i32 9)
+ %1 = bitcast <4 x i16> %a to <1 x i64>
+ %2 = bitcast <4 x i16> %vqshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqshrn_high_n_s64
+; CHECK: sqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> %b, i32 19)
+ %2 = bitcast <2 x i32> %vqshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+define <16 x i8> @test_vqshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqshrn_high_n_u16
+; CHECK: uqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> %b, i32 3)
+ %1 = bitcast <8 x i8> %a to <1 x i64>
+ %2 = bitcast <8 x i8> %vqshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqshrn_high_n_u32
+; CHECK: uqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> %b, i32 9)
+ %1 = bitcast <4 x i16> %a to <1 x i64>
+ %2 = bitcast <4 x i16> %vqshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqshrn_high_n_u64
+; CHECK: uqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> %b, i32 19)
+ %2 = bitcast <2 x i32> %vqshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+define <16 x i8> @test_vqrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqrshrn_high_n_s16
+; CHECK: sqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> %b, i32 3)
+ %1 = bitcast <8 x i8> %a to <1 x i64>
+ %2 = bitcast <8 x i8> %vqrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqrshrn_high_n_s32
+; CHECK: sqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> %b, i32 9)
+ %1 = bitcast <4 x i16> %a to <1 x i64>
+ %2 = bitcast <4 x i16> %vqrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqrshrn_high_n_s64
+; CHECK: sqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> %b, i32 19)
+ %2 = bitcast <2 x i32> %vqrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+define <16 x i8> @test_vqrshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqrshrn_high_n_u16
+; CHECK: uqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> %b, i32 3)
+ %1 = bitcast <8 x i8> %a to <1 x i64>
+ %2 = bitcast <8 x i8> %vqrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqrshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqrshrn_high_n_u32
+; CHECK: uqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> %b, i32 9)
+ %1 = bitcast <4 x i16> %a to <1 x i64>
+ %2 = bitcast <4 x i16> %vqrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqrshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqrshrn_high_n_u64
+; CHECK: uqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> %b, i32 19)
+ %2 = bitcast <2 x i32> %vqrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+
+
+declare <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64>, i32)
+
+declare <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32)
+
+declare <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32)
+
+declare <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32)
+
+declare <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32)
+
+declare <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32)
+
+declare <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32)
+
+declare <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32)
+
+declare <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32)
+
+declare <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32)
+
+declare <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32)
+
+define <1 x i64> @test_vcvt_n_s64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvt_n_s64_f64
+; CHECK: fcvtzs d{{[0-9]+}}, d{{[0-9]+}}, #64
+ %1 = tail call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> %a, i32 64)
+ ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvt_n_u64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvt_n_u64_f64
+; CHECK: fcvtzu d{{[0-9]+}}, d{{[0-9]+}}, #64
+ %1 = tail call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> %a, i32 64)
+ ret <1 x i64> %1
+}
+
+define <1 x double> @test_vcvt_n_f64_s64(<1 x i64> %a) {
+; CHECK-LABEL: test_vcvt_n_f64_s64
+; CHECK: scvtf d{{[0-9]+}}, d{{[0-9]+}}, #64
+ %1 = tail call <1 x double> @llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> %a, i32 64)
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vcvt_n_f64_u64(<1 x i64> %a) {
+; CHECK-LABEL: test_vcvt_n_f64_u64
+; CHECK: ucvtf d{{[0-9]+}}, d{{[0-9]+}}, #64
+ %1 = tail call <1 x double> @llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> %a, i32 64)
+ ret <1 x double> %1
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double>, i32)
+declare <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double>, i32)
+declare <1 x double> @llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64>, i32)
+declare <1 x double> @llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64>, i32)
diff --git a/test/CodeGen/AArch64/neon-simd-vget.ll b/test/CodeGen/AArch64/arm64-neon-simd-vget.ll
index 6474499..87f3956 100644
--- a/test/CodeGen/AArch64/neon-simd-vget.ll
+++ b/test/CodeGen/AArch64/arm64-neon-simd-vget.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
define <8 x i8> @test_vget_high_s8(<16 x i8> %a) {
; CHECK-LABEL: test_vget_high_s8:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <8 x i8> %shuffle.i
@@ -10,7 +10,7 @@ entry:
define <4 x i16> @test_vget_high_s16(<8 x i16> %a) {
; CHECK-LABEL: test_vget_high_s16:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
ret <4 x i16> %shuffle.i
@@ -18,7 +18,7 @@ entry:
define <2 x i32> @test_vget_high_s32(<4 x i32> %a) {
; CHECK-LABEL: test_vget_high_s32:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
ret <2 x i32> %shuffle.i
@@ -26,7 +26,7 @@ entry:
define <1 x i64> @test_vget_high_s64(<2 x i64> %a) {
; CHECK-LABEL: test_vget_high_s64:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
ret <1 x i64> %shuffle.i
@@ -34,7 +34,7 @@ entry:
define <8 x i8> @test_vget_high_u8(<16 x i8> %a) {
; CHECK-LABEL: test_vget_high_u8:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <8 x i8> %shuffle.i
@@ -42,7 +42,7 @@ entry:
define <4 x i16> @test_vget_high_u16(<8 x i16> %a) {
; CHECK-LABEL: test_vget_high_u16:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
ret <4 x i16> %shuffle.i
@@ -50,7 +50,7 @@ entry:
define <2 x i32> @test_vget_high_u32(<4 x i32> %a) {
; CHECK-LABEL: test_vget_high_u32:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
ret <2 x i32> %shuffle.i
@@ -58,7 +58,7 @@ entry:
define <1 x i64> @test_vget_high_u64(<2 x i64> %a) {
; CHECK-LABEL: test_vget_high_u64:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
ret <1 x i64> %shuffle.i
@@ -66,7 +66,7 @@ entry:
define <1 x i64> @test_vget_high_p64(<2 x i64> %a) {
; CHECK-LABEL: test_vget_high_p64:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
ret <1 x i64> %shuffle.i
@@ -74,7 +74,7 @@ entry:
define <4 x i16> @test_vget_high_f16(<8 x i16> %a) {
; CHECK-LABEL: test_vget_high_f16:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
ret <4 x i16> %shuffle.i
@@ -82,7 +82,7 @@ entry:
define <2 x float> @test_vget_high_f32(<4 x float> %a) {
; CHECK-LABEL: test_vget_high_f32:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 2, i32 3>
ret <2 x float> %shuffle.i
@@ -90,7 +90,7 @@ entry:
define <8 x i8> @test_vget_high_p8(<16 x i8> %a) {
; CHECK-LABEL: test_vget_high_p8:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <8 x i8> %shuffle.i
@@ -98,7 +98,7 @@ entry:
define <4 x i16> @test_vget_high_p16(<8 x i16> %a) {
; CHECK-LABEL: test_vget_high_p16:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
ret <4 x i16> %shuffle.i
@@ -106,7 +106,7 @@ entry:
define <1 x double> @test_vget_high_f64(<2 x double> %a) {
; CHECK-LABEL: test_vget_high_f64:
-; CHECK: dup d0, {{v[0-9]+}}.d[1]
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
entry:
%shuffle.i = shufflevector <2 x double> %a, <2 x double> undef, <1 x i32> <i32 1>
ret <1 x double> %shuffle.i
diff --git a/test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll b/test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll
new file mode 100644
index 0000000..74e3af8
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll
@@ -0,0 +1,74 @@
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=arm64-none-linux-gnu | FileCheck %s
+
+; This is the analogue of AArch64's file of the same name. It mostly tests
+; that some form of correct lowering occurs; the tests are a little artificial, but I
+; strongly suspect there's room for improved CodeGen (FIXME).
+
+define i64 @test_sext_extr_cmp_0(<1 x i64> %v1, <1 x i64> %v2) {
+; CHECK-LABEL: test_sext_extr_cmp_0:
+; CHECK: cmp {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: cset
+ %1 = icmp sge <1 x i64> %v1, %v2
+ %2 = extractelement <1 x i1> %1, i32 0
+ %vget_lane = sext i1 %2 to i64
+ ret i64 %vget_lane
+}
+
+define i64 @test_sext_extr_cmp_1(<1 x double> %v1, <1 x double> %v2) {
+; CHECK-LABEL: test_sext_extr_cmp_1:
+; CHECK: fcmp {{d[0-9]+}}, {{d[0-9]+}}
+ %1 = fcmp oeq <1 x double> %v1, %v2
+ %2 = extractelement <1 x i1> %1, i32 0
+ %vget_lane = sext i1 %2 to i64
+ ret i64 %vget_lane
+}
+
+define <1 x i64> @test_select_v1i1_0(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
+; CHECK-LABEL: test_select_v1i1_0:
+; CHECK: cmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: bic v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %1 = icmp eq <1 x i64> %v1, %v2
+ %res = select <1 x i1> %1, <1 x i64> zeroinitializer, <1 x i64> %v3
+ ret <1 x i64> %res
+}
+
+define <1 x i64> @test_select_v1i1_1(<1 x double> %v1, <1 x double> %v2, <1 x i64> %v3) {
+; CHECK-LABEL: test_select_v1i1_1:
+; CHECK: fcmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: bic v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %1 = fcmp oeq <1 x double> %v1, %v2
+ %res = select <1 x i1> %1, <1 x i64> zeroinitializer, <1 x i64> %v3
+ ret <1 x i64> %res
+}
+
+define <1 x double> @test_select_v1i1_2(<1 x i64> %v1, <1 x i64> %v2, <1 x double> %v3) {
+; CHECK-LABEL: test_select_v1i1_2:
+; CHECK: cmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: bic v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %1 = icmp eq <1 x i64> %v1, %v2
+ %res = select <1 x i1> %1, <1 x double> zeroinitializer, <1 x double> %v3
+ ret <1 x double> %res
+}
+
+define <1 x i64> @test_select_v1i1_3(i64 %lhs, i64 %rhs, <1 x i64> %v3) {
+; CHECK-LABEL: test_select_v1i1_3:
+; CHECK: cmp {{x[0-9]+}}, {{x[0-9]+}}
+ %tst = icmp eq i64 %lhs, %rhs
+ %evil = insertelement <1 x i1> undef, i1 %tst, i32 0
+ %res = select <1 x i1> %evil, <1 x i64> zeroinitializer, <1 x i64> %v3
+ ret <1 x i64> %res
+}
+
+define i32 @test_br_extr_cmp(<1 x i64> %v1, <1 x i64> %v2) {
+; CHECK-LABEL: test_br_extr_cmp:
+; CHECK: cmp x{{[0-9]+}}, x{{[0-9]+}}
+ %1 = icmp eq <1 x i64> %v1, %v2
+ %2 = extractelement <1 x i1> %1, i32 0
+ br i1 %2, label %if.end, label %if.then
+
+if.then:
+ ret i32 0;
+
+if.end:
+ ret i32 1;
+}
diff --git a/test/CodeGen/AArch64/neon-vector-list-spill.ll b/test/CodeGen/AArch64/arm64-neon-vector-list-spill.ll
index 3ab69c4..8262fe4 100644
--- a/test/CodeGen/AArch64/neon-vector-list-spill.ll
+++ b/test/CodeGen/AArch64/arm64-neon-vector-list-spill.ll
@@ -1,16 +1,16 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
; FIXME: We should not generate ld/st for such register spill/fill, because the
; test case seems very simple and the register pressure is not high. If the
; spill/fill algorithm is optimized, this test case may not be triggered. And
; then we can delete it.
-define i32 @spill.DPairReg(i8* %arg1, i32 %arg2) {
+define i32 @spill.DPairReg(i32* %arg1, i32 %arg2) {
; CHECK-LABEL: spill.DPairReg:
-; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
-; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld2 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [{{x[0-9]+|sp}}]
+; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
entry:
- %vld = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8* %arg1, i32 4)
+ %vld = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32* %arg1)
%cmp = icmp eq i32 %arg2, 0
br i1 %cmp, label %if.then, label %if.end
@@ -24,13 +24,13 @@ if.end:
ret i32 %res
}
-define i16 @spill.DTripleReg(i8* %arg1, i32 %arg2) {
+define i16 @spill.DTripleReg(i16* %arg1, i32 %arg2) {
; CHECK-LABEL: spill.DTripleReg:
-; CHECK: ld3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
-; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld3 { v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h }, [{{x[0-9]+|sp}}]
+; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
entry:
- %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %arg1, i32 4)
+ %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0i16(i16* %arg1)
%cmp = icmp eq i32 %arg2, 0
br i1 %cmp, label %if.then, label %if.end
@@ -44,13 +44,13 @@ if.end:
ret i16 %res
}
-define i16 @spill.DQuadReg(i8* %arg1, i32 %arg2) {
+define i16 @spill.DQuadReg(i16* %arg1, i32 %arg2) {
; CHECK-LABEL: spill.DQuadReg:
-; CHECK: ld4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
-; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld4 { v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h }, [{{x[0-9]+|sp}}]
+; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
entry:
- %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8* %arg1, i32 4)
+ %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0i16(i16* %arg1)
%cmp = icmp eq i32 %arg2, 0
br i1 %cmp, label %if.then, label %if.end
@@ -64,13 +64,13 @@ if.end:
ret i16 %res
}
-define i32 @spill.QPairReg(i8* %arg1, i32 %arg2) {
+define i32 @spill.QPairReg(i32* %arg1, i32 %arg2) {
; CHECK-LABEL: spill.QPairReg:
-; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
-; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld2 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}]
+; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
entry:
- %vld = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8* %arg1, i32 4)
+ %vld = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i32(i32* %arg1)
%cmp = icmp eq i32 %arg2, 0
br i1 %cmp, label %if.then, label %if.end
@@ -84,13 +84,13 @@ if.end:
ret i32 %res
}
-define float @spill.QTripleReg(i8* %arg1, i32 %arg2) {
+define float @spill.QTripleReg(float* %arg1, i32 %arg2) {
; CHECK-LABEL: spill.QTripleReg:
-; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
-; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld3 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}]
+; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
entry:
- %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8* %arg1, i32 4)
+ %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0f32(float* %arg1)
%cmp = icmp eq i32 %arg2, 0
br i1 %cmp, label %if.then, label %if.end
@@ -106,11 +106,11 @@ if.end:
define i8 @spill.QQuadReg(i8* %arg1, i32 %arg2) {
; CHECK-LABEL: spill.QQuadReg:
-; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
-; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+; CHECK: ld4 { v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b }, [{{x[0-9]+|sp}}]
+; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
entry:
- %vld = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %arg1, i32 4)
+ %vld = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0i8(i8* %arg1)
%cmp = icmp eq i32 %arg2, 0
br i1 %cmp, label %if.then, label %if.end
@@ -124,12 +124,12 @@ if.end:
ret i8 %res
}
-declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8*, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8*, i32)
-declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8*, i32)
-declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8*, i32)
-declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32)
+declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32*)
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0i16(i16*)
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0i16(i16*)
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i32(i32*)
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0f32(float*)
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0i8(i8*)
declare void @foo()
@@ -138,8 +138,8 @@ declare void @foo()
; spill/fill algorithm is optimized, this test case may not be triggered. And
; then we can delete it.
; check the spill for Register Class QPair_with_qsub_0_in_FPR128Lo
-define <8 x i16> @test_2xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) {
- tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %ptr, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i32 0, i32 8)
+define <8 x i16> @test_2xFPR128Lo(i64 %got, i64* %ptr, <1 x i64> %a) {
+ tail call void @llvm.aarch64.neon.st2lane.v1i64.p0i64(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, i64* %ptr)
tail call void @foo()
%sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1>
%1 = bitcast <2 x i64> %sv to <8 x i16>
@@ -149,8 +149,8 @@ define <8 x i16> @test_2xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) {
}
; check the spill for Register Class QTriple_with_qsub_0_in_FPR128Lo
-define <8 x i16> @test_3xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) {
- tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %ptr, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i32 0, i32 8)
+define <8 x i16> @test_3xFPR128Lo(i64 %got, i64* %ptr, <1 x i64> %a) {
+ tail call void @llvm.aarch64.neon.st3lane.v1i64.p0i64(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, i64* %ptr)
tail call void @foo()
%sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1>
%1 = bitcast <2 x i64> %sv to <8 x i16>
@@ -160,8 +160,8 @@ define <8 x i16> @test_3xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) {
}
; check the spill for Register Class QQuad_with_qsub_0_in_FPR128Lo
-define <8 x i16> @test_4xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) {
- tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %ptr, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i32 0, i32 8)
+define <8 x i16> @test_4xFPR128Lo(i64 %got, i64* %ptr, <1 x i64> %a) {
+ tail call void @llvm.aarch64.neon.st4lane.v1i64.p0i64(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, i64* %ptr)
tail call void @foo()
%sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1>
%1 = bitcast <2 x i64> %sv to <8 x i16>
@@ -170,6 +170,6 @@ define <8 x i16> @test_4xFPR128Lo(i64 %got, i8* %ptr, <1 x i64> %a) {
ret <8 x i16> %3
}
-declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
\ No newline at end of file
+declare void @llvm.aarch64.neon.st2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*)
+declare void @llvm.aarch64.neon.st3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*)
+declare void @llvm.aarch64.neon.st4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*)
diff --git a/test/CodeGen/AArch64/arm64-patchpoint.ll b/test/CodeGen/AArch64/arm64-patchpoint.ll
new file mode 100644
index 0000000..039cdfc
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-patchpoint.ll
@@ -0,0 +1,171 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin -enable-misched=0 -mcpu=cyclone | FileCheck %s
+
+; Trivial patchpoint codegen
+;
+define i64 @trivial_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: trivial_patchpoint_codegen:
+; CHECK: movz x16, #0xdead, lsl #32
+; CHECK-NEXT: movk x16, #0xbeef, lsl #16
+; CHECK-NEXT: movk x16, #0xcafe
+; CHECK-NEXT: blr x16
+; CHECK: movz x16, #0xdead, lsl #32
+; CHECK-NEXT: movk x16, #0xbeef, lsl #16
+; CHECK-NEXT: movk x16, #0xcaff
+; CHECK-NEXT: blr x16
+; CHECK: ret
+ %resolveCall2 = inttoptr i64 244837814094590 to i8*
+ %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+ %resolveCall3 = inttoptr i64 244837814094591 to i8*
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 3, i32 20, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
+ ret i64 %result
+}
+
+; Caller frame metadata with stackmaps. This should not be optimized
+; as a leaf function.
+;
+; CHECK-LABEL: caller_meta_leaf
+; CHECK: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK: Ltmp
+; CHECK: mov sp, x29
+; CHECK: ret
+
+define void @caller_meta_leaf() {
+entry:
+ %metadata = alloca i64, i32 3, align 8
+ store i64 11, i64* %metadata
+ store i64 12, i64* %metadata
+ store i64 13, i64* %metadata
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
+ ret void
+}
+
+; Test the webkit_jscc calling convention.
+; One argument will be passed in register, the other will be pushed on the stack.
+; Return value in x0.
+define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen:
+; CHECK: Ltmp
+; CHECK: str x{{.+}}, [sp]
+; CHECK-NEXT: mov x0, x{{.+}}
+; CHECK: Ltmp
+; CHECK-NEXT: movz x16, #0xffff, lsl #32
+; CHECK-NEXT: movk x16, #0xdead, lsl #16
+; CHECK-NEXT: movk x16, #0xbeef
+; CHECK-NEXT: blr x16
+ %resolveCall2 = inttoptr i64 281474417671919 to i8*
+ %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
+ %resolveCall3 = inttoptr i64 244837814038255 to i8*
+ tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
+ ret void
+}
+
+; Test that the arguments are properly aligned and that we don't store undef arguments.
+define i64 @jscall_patchpoint_codegen2(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen2:
+; CHECK: Ltmp
+; CHECK: orr w{{.+}}, wzr, #0x6
+; CHECK-NEXT: str x{{.+}}, [sp, #24]
+; CHECK-NEXT: orr w{{.+}}, wzr, #0x4
+; CHECK-NEXT: str w{{.+}}, [sp, #16]
+; CHECK-NEXT: orr w{{.+}}, wzr, #0x2
+; CHECK-NEXT: str x{{.+}}, [sp]
+; CHECK: Ltmp
+; CHECK-NEXT: movz x16, #0xffff, lsl #32
+; CHECK-NEXT: movk x16, #0xdead, lsl #16
+; CHECK-NEXT: movk x16, #0xbeef
+; CHECK-NEXT: blr x16
+ %call = inttoptr i64 281474417671919 to i8*
+ %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
+ ret i64 %result
+}
+
+; Test that the arguments are properly aligned and that we don't store undef arguments.
+define i64 @jscall_patchpoint_codegen3(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen3:
+; CHECK: Ltmp
+; CHECK: movz w{{.+}}, #0xa
+; CHECK-NEXT: str x{{.+}}, [sp, #48]
+; CHECK-NEXT: orr w{{.+}}, wzr, #0x8
+; CHECK-NEXT: str w{{.+}}, [sp, #36]
+; CHECK-NEXT: orr w{{.+}}, wzr, #0x6
+; CHECK-NEXT: str x{{.+}}, [sp, #24]
+; CHECK-NEXT: orr w{{.+}}, wzr, #0x4
+; CHECK-NEXT: str w{{.+}}, [sp, #16]
+; CHECK-NEXT: orr w{{.+}}, wzr, #0x2
+; CHECK-NEXT: str x{{.+}}, [sp]
+; CHECK: Ltmp
+; CHECK-NEXT: movz x16, #0xffff, lsl #32
+; CHECK-NEXT: movk x16, #0xdead, lsl #16
+; CHECK-NEXT: movk x16, #0xbeef
+; CHECK-NEXT: blr x16
+ %call = inttoptr i64 281474417671919 to i8*
+ %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
+ ret i64 %result
+}
+
+; Test patchpoints reusing the same TargetConstant.
+; <rdar:15390785> Assertion failed: (CI.getNumArgOperands() >= NumArgs + 4)
+; There is no way to verify this, since it depends on memory allocation.
+; But I think it's useful to include as a working example.
+define i64 @testLowerConstant(i64 %arg, i64 %tmp2, i64 %tmp10, i64* %tmp33, i64 %tmp79) {
+entry:
+ %tmp80 = add i64 %tmp79, -16
+ %tmp81 = inttoptr i64 %tmp80 to i64*
+ %tmp82 = load i64* %tmp81, align 8
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 8, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 15, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
+ %tmp83 = load i64* %tmp33, align 8
+ %tmp84 = add i64 %tmp83, -24
+ %tmp85 = inttoptr i64 %tmp84 to i64*
+ %tmp86 = load i64* %tmp85, align 8
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 17, i32 8, i64 %arg, i64 %tmp10, i64 %tmp86)
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 18, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
+ ret i64 10
+}
+
+; Test small patchpoints that don't emit calls.
+define void @small_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: small_patchpoint_codegen:
+; CHECK: Ltmp
+; CHECK: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: ldp
+; CHECK-NEXT: ret
+ %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* null, i32 2, i64 %p1, i64 %p2)
+ ret void
+}
+
+; Test that scratch registers are spilled around patchpoints
+; CHECK: InlineAsm End
+; CHECK-NEXT: mov x{{[0-9]+}}, x16
+; CHECK-NEXT: mov x{{[0-9]+}}, x17
+; CHECK-NEXT: Ltmp
+; CHECK-NEXT: nop
+define void @clobberScratch(i32* %p) {
+ %v = load i32* %p
+ tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 5, i32 20, i8* null, i32 0, i32* %p, i32 %v)
+ store i32 %v, i32* %p
+ ret void
+}
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
+
+; CHECK-LABEL: test_i16:
+; CHECK: ldrh [[BREG:w[0-9]+]], [sp]
+; CHECK: add w0, w0, [[BREG]]
+define webkit_jscc i16 @test_i16(i16 zeroext %a, i16 zeroext %b) {
+ %sum = add i16 %a, %b
+ ret i16 %sum
+}
diff --git a/test/CodeGen/AArch64/arm64-pic-local-symbol.ll b/test/CodeGen/AArch64/arm64-pic-local-symbol.ll
new file mode 100644
index 0000000..627e741
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-pic-local-symbol.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple=arm64-unknown-linux-gnu -relocation-model=pic < %s | FileCheck %s
+
+@a = internal unnamed_addr global i32 0, align 4
+@.str = private unnamed_addr constant [6 x i8] c"test\0A\00", align 1
+
+define i32 @get() {
+; CHECK: get:
+; CHECK: adrp x{{[0-9]+}}, a
+; CHECK-NEXT: ldr w{{[0-9]+}}, [x{{[0-9]}}, :lo12:a]
+ %res = load i32* @a, align 4
+ ret i32 %res
+}
+
+define void @foo() nounwind {
+; CHECK: foo:
+; CHECK: adrp x{{[0-9]}}, .L.str
+; CHECK-NEXT: add x{{[0-9]}}, x{{[0-9]}}, :lo12:.L.str
+ tail call void @bar(i8* getelementptr inbounds ([6 x i8]* @.str, i64 0, i64 0))
+ ret void
+}
+
+declare void @bar(i8*)
diff --git a/test/CodeGen/AArch64/arm64-platform-reg.ll b/test/CodeGen/AArch64/arm64-platform-reg.ll
new file mode 100644
index 0000000..651c793
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-platform-reg.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s --check-prefix=CHECK-DARWIN
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+
+; x18 is reserved as a platform register on Darwin but not on other
+; systems. Create loads of register pressure and make sure this is respected.
+
+; Also, fp must always refer to a valid frame record, even if it's not the one
+; of the current function, so it shouldn't be used either.
+
+@var = global [30 x i64] zeroinitializer
+
+define void @keep_live() {
+ %val = load volatile [30 x i64]* @var
+ store volatile [30 x i64] %val, [30 x i64]* @var
+
+; CHECK: ldr x18
+; CHECK: str x18
+
+; CHECK-DARWIN-NOT: ldr fp
+; CHECK-DARWIN-NOT: ldr x18
+; CHECK-DARWIN: Spill
+; CHECK-DARWIN-NOT: ldr fp
+; CHECK-DARWIN-NOT: ldr x18
+; CHECK-DARWIN: ret
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-popcnt.ll b/test/CodeGen/AArch64/arm64-popcnt.ll
new file mode 100644
index 0000000..2afade2
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
+ %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
+ ret i32 %cnt
+; CHECK: fmov s0, w0
+; CHECK: cnt.8b v0, v0
+; CHECK: uaddlv.8b h0, v0
+; CHECK: fmov w0, s0
+; CHECK: ret
+}
+
+define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
+ %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
+ ret i64 %cnt
+; CHECK: fmov d0, x0
+; CHECK: cnt.8b v0, v0
+; CHECK: uaddlv.8b h0, v0
+; CHECK: fmov w0, s0
+; CHECK: ret
+}
+
+; Do not use AdvSIMD when -mno-implicit-float is specified.
+; rdar://9473858
+
+define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat {
+ %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
+ ret i32 %cnt
+; CHECK-LABEL: cnt32:
+; CHECK-NOT: 16b
+; CHECK: ret
+}
+
+define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat {
+ %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
+ ret i64 %cnt
+; CHECK-LABEL: cnt64:
+; CHECK-NOT: 16b
+; CHECK: ret
+}
+
+declare i32 @llvm.ctpop.i32(i32) nounwind readnone
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-prefetch.ll b/test/CodeGen/AArch64/arm64-prefetch.ll
new file mode 100644
index 0000000..b2e06ed
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-prefetch.ll
@@ -0,0 +1,88 @@
+; RUN: llc %s -march arm64 -o - | FileCheck %s
+
+@a = common global i32* null, align 8
+
+define void @test(i32 %i, i32 %j) nounwind ssp {
+entry:
+ ; CHECK: @test
+ %j.addr = alloca i32, align 4
+ store i32 %j, i32* %j.addr, align 4, !tbaa !0
+ %tmp = bitcast i32* %j.addr to i8*
+ ; CHECK: prfum pldl1strm
+ call void @llvm.prefetch(i8* %tmp, i32 0, i32 0, i32 1)
+ ; CHECK: prfum pldl3keep
+ call void @llvm.prefetch(i8* %tmp, i32 0, i32 1, i32 1)
+ ; CHECK: prfum pldl2keep
+ call void @llvm.prefetch(i8* %tmp, i32 0, i32 2, i32 1)
+ ; CHECK: prfum pldl1keep
+ call void @llvm.prefetch(i8* %tmp, i32 0, i32 3, i32 1)
+
+ ; CHECK: prfum pstl1strm
+ call void @llvm.prefetch(i8* %tmp, i32 1, i32 0, i32 1)
+ ; CHECK: prfum pstl3keep
+ call void @llvm.prefetch(i8* %tmp, i32 1, i32 1, i32 1)
+ ; CHECK: prfum pstl2keep
+ call void @llvm.prefetch(i8* %tmp, i32 1, i32 2, i32 1)
+ ; CHECK: prfum pstl1keep
+ call void @llvm.prefetch(i8* %tmp, i32 1, i32 3, i32 1)
+
+ %tmp1 = load i32* %j.addr, align 4, !tbaa !0
+ %add = add nsw i32 %tmp1, %i
+ %idxprom = sext i32 %add to i64
+ %tmp2 = load i32** @a, align 8, !tbaa !3
+ %arrayidx = getelementptr inbounds i32* %tmp2, i64 %idxprom
+ %tmp3 = bitcast i32* %arrayidx to i8*
+
+ ; CHECK: prfm pldl1strm
+ call void @llvm.prefetch(i8* %tmp3, i32 0, i32 0, i32 1)
+ %tmp4 = load i32** @a, align 8, !tbaa !3
+ %arrayidx3 = getelementptr inbounds i32* %tmp4, i64 %idxprom
+ %tmp5 = bitcast i32* %arrayidx3 to i8*
+
+ ; CHECK: prfm pldl3keep
+ call void @llvm.prefetch(i8* %tmp5, i32 0, i32 1, i32 1)
+ %tmp6 = load i32** @a, align 8, !tbaa !3
+ %arrayidx6 = getelementptr inbounds i32* %tmp6, i64 %idxprom
+ %tmp7 = bitcast i32* %arrayidx6 to i8*
+
+ ; CHECK: prfm pldl2keep
+ call void @llvm.prefetch(i8* %tmp7, i32 0, i32 2, i32 1)
+ %tmp8 = load i32** @a, align 8, !tbaa !3
+ %arrayidx9 = getelementptr inbounds i32* %tmp8, i64 %idxprom
+ %tmp9 = bitcast i32* %arrayidx9 to i8*
+
+ ; CHECK: prfm pldl1keep
+ call void @llvm.prefetch(i8* %tmp9, i32 0, i32 3, i32 1)
+ %tmp10 = load i32** @a, align 8, !tbaa !3
+ %arrayidx12 = getelementptr inbounds i32* %tmp10, i64 %idxprom
+ %tmp11 = bitcast i32* %arrayidx12 to i8*
+
+ ; CHECK: prfm pstl1strm
+ call void @llvm.prefetch(i8* %tmp11, i32 1, i32 0, i32 1)
+ %tmp12 = load i32** @a, align 8, !tbaa !3
+ %arrayidx15 = getelementptr inbounds i32* %tmp12, i64 %idxprom
+ %tmp13 = bitcast i32* %arrayidx15 to i8*
+
+ ; CHECK: prfm pstl3keep
+ call void @llvm.prefetch(i8* %tmp13, i32 1, i32 1, i32 1)
+ %tmp14 = load i32** @a, align 8, !tbaa !3
+ %arrayidx18 = getelementptr inbounds i32* %tmp14, i64 %idxprom
+ %tmp15 = bitcast i32* %arrayidx18 to i8*
+
+ ; CHECK: prfm pstl2keep
+ call void @llvm.prefetch(i8* %tmp15, i32 1, i32 2, i32 1)
+ %tmp16 = load i32** @a, align 8, !tbaa !3
+ %arrayidx21 = getelementptr inbounds i32* %tmp16, i64 %idxprom
+ %tmp17 = bitcast i32* %arrayidx21 to i8*
+
+ ; CHECK: prfm pstl1keep
+ call void @llvm.prefetch(i8* %tmp17, i32 1, i32 3, i32 1)
+ ret void
+}
+
+declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"any pointer", metadata !1}
diff --git a/test/CodeGen/AArch64/arm64-promote-const.ll b/test/CodeGen/AArch64/arm64-promote-const.ll
new file mode 100644
index 0000000..380ff55
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-promote-const.ll
@@ -0,0 +1,255 @@
+; Disable machine CSE to stress the different paths of the algorithm.
+; Otherwise, we always fall into the simple case, i.e., only one definition.
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-machine-cse -aarch64-stress-promote-const -mcpu=cyclone | FileCheck -check-prefix=PROMOTED %s
+; The REGULAR run just checks that the inputs passed to promote const expose
+; the appropriate patterns.
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-machine-cse -aarch64-promote-const=false -mcpu=cyclone | FileCheck -check-prefix=REGULAR %s
+
+%struct.uint8x16x4_t = type { [4 x <16 x i8>] }
+
+; Constant is a structure
+define %struct.uint8x16x4_t @test1() {
+; PROMOTED-LABEL: test1:
+; Promote constant has created a big constant for the whole structure
+; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], __PromotedConst@PAGE
+; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], __PromotedConst@PAGEOFF
+; Destination registers are defined by the ABI
+; PROMOTED-NEXT: ldp q0, q1, {{\[}}[[BASEADDR]]]
+; PROMOTED-NEXT: ldp q2, q3, {{\[}}[[BASEADDR]], #32]
+; PROMOTED-NEXT: ret
+
+; REGULAR-LABEL: test1:
+; Regular access is quite bad: it performs 4 loads, one for each chunk of
+; the structure
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL:lCP.*]]@PAGE
+; Destination registers are defined by the ABI
+; REGULAR: ldr q0, {{\[}}[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF]
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL:lCP.*]]@PAGE
+; REGULAR: ldr q1, {{\[}}[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF]
+; REGULAR: adrp [[PAGEADDR2:x[0-9]+]], [[CSTLABEL2:lCP.*]]@PAGE
+; REGULAR: ldr q2, {{\[}}[[PAGEADDR2]], [[CSTLABEL2]]@PAGEOFF]
+; REGULAR: adrp [[PAGEADDR3:x[0-9]+]], [[CSTLABEL3:lCP.*]]@PAGE
+; REGULAR: ldr q3, {{\[}}[[PAGEADDR3]], [[CSTLABEL3]]@PAGEOFF]
+; REGULAR-NEXT: ret
+entry:
+ ret %struct.uint8x16x4_t { [4 x <16 x i8>] [<16 x i8> <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>, <16 x i8> <i8 32, i8 124, i8 121, i8 120, i8 8, i8 117, i8 -56, i8 113, i8 -76, i8 110, i8 -53, i8 107, i8 7, i8 105, i8 103, i8 102>, <16 x i8> <i8 -24, i8 99, i8 -121, i8 97, i8 66, i8 95, i8 24, i8 93, i8 6, i8 91, i8 12, i8 89, i8 39, i8 87, i8 86, i8 85>, <16 x i8> <i8 -104, i8 83, i8 -20, i8 81, i8 81, i8 80, i8 -59, i8 78, i8 73, i8 77, i8 -37, i8 75, i8 122, i8 74, i8 37, i8 73>] }
+}
+
+; Two different uses of the same constant in the same basic block
+define <16 x i8> @test2(<16 x i8> %arg) {
+entry:
+; PROMOTED-LABEL: test2:
+; In stress mode, constant vectors are promoted
+; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], [[CSTV1:__PromotedConst[0-9]+]]@PAGE
+; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], [[CSTV1]]@PAGEOFF
+; PROMOTED: ldr q[[REGNUM:[0-9]+]], {{\[}}[[BASEADDR]]]
+; Destination register is defined by ABI
+; PROMOTED-NEXT: add.16b v0, v0, v[[REGNUM]]
+; PROMOTED-NEXT: mla.16b v0, v0, v[[REGNUM]]
+; PROMOTED-NEXT: ret
+
+; REGULAR-LABEL: test2:
+; Regular access is strictly the same as promoted access.
+; The difference is that the address (and thus the space in memory) is not
+; shared between constants
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL:lCP.*]]@PAGE
+; REGULAR: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF]
+; Destination register is defined by ABI
+; REGULAR-NEXT: add.16b v0, v0, v[[REGNUM]]
+; REGULAR-NEXT: mla.16b v0, v0, v[[REGNUM]]
+; REGULAR-NEXT: ret
+ %add.i = add <16 x i8> %arg, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+ %mul.i = mul <16 x i8> %add.i, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+ %add.i9 = add <16 x i8> %add.i, %mul.i
+ ret <16 x i8> %add.i9
+}
+
+; Two different uses of the same constant in two different basic blocks,
+; one dominates the other
+define <16 x i8> @test3(<16 x i8> %arg, i32 %path) {
+; PROMOTED-LABEL: test3:
+; In stress mode, constant vectors are promoted
+; Since the constant is the same as in the previous function,
+; the same address must be used
+; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], [[CSTV1]]@PAGE
+; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], [[CSTV1]]@PAGEOFF
+; PROMOTED-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[BASEADDR]]]
+; Destination register is defined by ABI
+; PROMOTED-NEXT: add.16b v0, v0, v[[REGNUM]]
+; PROMOTED-NEXT: cbnz w0, [[LABEL:LBB.*]]
+; Next BB
+; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], [[CSTV2:__PromotedConst[0-9]+]]@PAGE
+; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], [[CSTV2]]@PAGEOFF
+; PROMOTED-NEXT: ldr q[[REGNUM]], {{\[}}[[BASEADDR]]]
+; Next BB
+; PROMOTED-NEXT: [[LABEL]]:
+; PROMOTED-NEXT: mul.16b [[DESTV:v[0-9]+]], v0, v[[REGNUM]]
+; PROMOTED-NEXT: add.16b v0, v0, [[DESTV]]
+; PROMOTED-NEXT: ret
+
+; REGULAR-LABEL: test3:
+; Regular mode does not eliminate common subexpressions on its own.
+; In other words, the same loads appear several times.
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL1:lCP.*]]@PAGE
+; REGULAR-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTLABEL1]]@PAGEOFF]
+; Destination register is defined by ABI
+; REGULAR-NEXT: add.16b v0, v0, v[[REGNUM]]
+; REGULAR-NEXT: cbz w0, [[LABELelse:LBB.*]]
+; Next BB
+; Redundant load
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL1]]@PAGE
+; REGULAR-NEXT: ldr q[[REGNUM]], {{\[}}[[PAGEADDR]], [[CSTLABEL1]]@PAGEOFF]
+; REGULAR-NEXT: b [[LABELend:LBB.*]]
+; Next BB
+; REGULAR-NEXT: [[LABELelse]]
+; REGULAR-NEXT: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL2:lCP.*]]@PAGE
+; REGULAR-NEXT: ldr q[[REGNUM]], {{\[}}[[PAGEADDR]], [[CSTLABEL2]]@PAGEOFF]
+; Next BB
+; REGULAR-NEXT: [[LABELend]]:
+; REGULAR-NEXT: mul.16b [[DESTV:v[0-9]+]], v0, v[[REGNUM]]
+; REGULAR-NEXT: add.16b v0, v0, [[DESTV]]
+; REGULAR-NEXT: ret
+entry:
+ %add.i = add <16 x i8> %arg, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+ %tobool = icmp eq i32 %path, 0
+ br i1 %tobool, label %if.else, label %if.then
+
+if.then: ; preds = %entry
+ %mul.i13 = mul <16 x i8> %add.i, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+ br label %if.end
+
+if.else: ; preds = %entry
+ %mul.i = mul <16 x i8> %add.i, <i8 -24, i8 99, i8 -121, i8 97, i8 66, i8 95, i8 24, i8 93, i8 6, i8 91, i8 12, i8 89, i8 39, i8 87, i8 86, i8 85>
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %ret2.0 = phi <16 x i8> [ %mul.i13, %if.then ], [ %mul.i, %if.else ]
+ %add.i12 = add <16 x i8> %add.i, %ret2.0
+ ret <16 x i8> %add.i12
+}
+
+; Two different uses of the same constant in two different basic blocks,
+; neither dominates the other
+define <16 x i8> @test4(<16 x i8> %arg, i32 %path) {
+; PROMOTED-LABEL: test4:
+; In stress mode, constant vectors are promoted
+; Since the constant is the same as in the previous function,
+; the same address must be used
+; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], [[CSTV1]]@PAGE
+; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], [[CSTV1]]@PAGEOFF
+; PROMOTED-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[BASEADDR]]]
+; Destination register is defined by ABI
+; PROMOTED-NEXT: add.16b v0, v0, v[[REGNUM]]
+; PROMOTED-NEXT: cbz w0, [[LABEL:LBB.*]]
+; Next BB
+; PROMOTED: mul.16b v0, v0, v[[REGNUM]]
+; Next BB
+; PROMOTED-NEXT: [[LABEL]]:
+; PROMOTED-NEXT: ret
+
+
+; REGULAR-LABEL: test4:
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL3:lCP.*]]@PAGE
+; REGULAR-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTLABEL3]]@PAGEOFF]
+; Destination register is defined by ABI
+; REGULAR-NEXT: add.16b v0, v0, v[[REGNUM]]
+; REGULAR-NEXT: cbz w0, [[LABEL:LBB.*]]
+; Next BB
+; Redundant expression
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL3]]@PAGE
+; REGULAR-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTLABEL3]]@PAGEOFF]
+; Destination register is defined by ABI
+; REGULAR-NEXT: mul.16b v0, v0, v[[REGNUM]]
+; Next BB
+; REGULAR-NEXT: [[LABEL]]:
+; REGULAR-NEXT: ret
+entry:
+ %add.i = add <16 x i8> %arg, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+ %tobool = icmp eq i32 %path, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %mul.i = mul <16 x i8> %add.i, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ %ret.0 = phi <16 x i8> [ %mul.i, %if.then ], [ %add.i, %entry ]
+ ret <16 x i8> %ret.0
+}
+
+; Two different uses of the same constant in two different basic blocks,
+; one is in a phi.
+define <16 x i8> @test5(<16 x i8> %arg, i32 %path) {
+; PROMOTED-LABEL: test5:
+; In stress mode, constant vectors are promoted
+; Since the constant is the same as in the previous function,
+; the same address must be used
+; PROMOTED: adrp [[PAGEADDR:x[0-9]+]], [[CSTV1]]@PAGE
+; PROMOTED: add [[BASEADDR:x[0-9]+]], [[PAGEADDR]], [[CSTV1]]@PAGEOFF
+; PROMOTED-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[BASEADDR]]]
+; PROMOTED-NEXT: cbz w0, [[LABEL:LBB.*]]
+; Next BB
+; PROMOTED: add.16b [[DESTV:v[0-9]+]], v0, v[[REGNUM]]
+; PROMOTED-NEXT: mul.16b v[[REGNUM]], [[DESTV]], v[[REGNUM]]
+; Next BB
+; PROMOTED-NEXT: [[LABEL]]:
+; PROMOTED-NEXT: mul.16b [[TMP1:v[0-9]+]], v[[REGNUM]], v[[REGNUM]]
+; PROMOTED-NEXT: mul.16b [[TMP2:v[0-9]+]], [[TMP1]], [[TMP1]]
+; PROMOTED-NEXT: mul.16b [[TMP3:v[0-9]+]], [[TMP2]], [[TMP2]]
+; PROMOTED-NEXT: mul.16b v0, [[TMP3]], [[TMP3]]
+; PROMOTED-NEXT: ret
+
+; REGULAR-LABEL: test5:
+; REGULAR: cbz w0, [[LABELelse:LBB.*]]
+; Next BB
+; REGULAR: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL:lCP.*]]@PAGE
+; REGULAR-NEXT: ldr q[[REGNUM:[0-9]+]], {{\[}}[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF]
+; REGULAR-NEXT: add.16b [[DESTV:v[0-9]+]], v0, v[[REGNUM]]
+; REGULAR-NEXT: mul.16b v[[DESTREGNUM:[0-9]+]], [[DESTV]], v[[REGNUM]]
+; REGULAR-NEXT: b [[LABELend:LBB.*]]
+; Next BB
+; REGULAR-NEXT: [[LABELelse]]
+; REGULAR-NEXT: adrp [[PAGEADDR:x[0-9]+]], [[CSTLABEL:lCP.*]]@PAGE
+; REGULAR-NEXT: ldr q[[DESTREGNUM]], {{\[}}[[PAGEADDR]], [[CSTLABEL]]@PAGEOFF]
+; Next BB
+; REGULAR-NEXT: [[LABELend]]:
+; REGULAR-NEXT: mul.16b [[TMP1:v[0-9]+]], v[[DESTREGNUM]], v[[DESTREGNUM]]
+; REGULAR-NEXT: mul.16b [[TMP2:v[0-9]+]], [[TMP1]], [[TMP1]]
+; REGULAR-NEXT: mul.16b [[TMP3:v[0-9]+]], [[TMP2]], [[TMP2]]
+; REGULAR-NEXT: mul.16b v0, [[TMP3]], [[TMP3]]
+; REGULAR-NEXT: ret
+entry:
+ %tobool = icmp eq i32 %path, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %add.i = add <16 x i8> %arg, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+ %mul.i26 = mul <16 x i8> %add.i, <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ %ret.0 = phi <16 x i8> [ %mul.i26, %if.then ], [ <i8 -40, i8 -93, i8 -118, i8 -99, i8 -75, i8 -105, i8 74, i8 -110, i8 62, i8 -115, i8 -119, i8 -120, i8 34, i8 -124, i8 0, i8 -128>, %entry ]
+ %mul.i25 = mul <16 x i8> %ret.0, %ret.0
+ %mul.i24 = mul <16 x i8> %mul.i25, %mul.i25
+ %mul.i23 = mul <16 x i8> %mul.i24, %mul.i24
+ %mul.i = mul <16 x i8> %mul.i23, %mul.i23
+ ret <16 x i8> %mul.i
+}
+
+define void @accessBig(i64* %storage) {
+; PROMOTED-LABEL: accessBig:
+; PROMOTED: adrp
+; PROMOTED: ret
+ %addr = bitcast i64* %storage to <1 x i80>*
+ store <1 x i80> <i80 483673642326615442599424>, <1 x i80>* %addr
+ ret void
+}
+
+define void @asmStatement() {
+; PROMOTED-LABEL: asmStatement:
+; PROMOTED-NOT: adrp
+; PROMOTED: ret
+ call void asm sideeffect "bfxil w0, w0, $0, $1", "i,i"(i32 28, i32 4)
+ ret void
+}
+
diff --git a/test/CodeGen/AArch64/arm64-redzone.ll b/test/CodeGen/AArch64/arm64-redzone.ll
new file mode 100644
index 0000000..9b0c384
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-redzone.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -march=arm64 -aarch64-redzone | FileCheck %s
+
+define i32 @foo(i32 %a, i32 %b) nounwind ssp {
+; CHECK-LABEL: foo:
+; CHECK-NOT: sub sp, sp
+; CHECK: ret
+ %a.addr = alloca i32, align 4
+ %b.addr = alloca i32, align 4
+ %x = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 %b, i32* %b.addr, align 4
+ %tmp = load i32* %a.addr, align 4
+ %tmp1 = load i32* %b.addr, align 4
+ %add = add nsw i32 %tmp, %tmp1
+ store i32 %add, i32* %x, align 4
+ %tmp2 = load i32* %x, align 4
+ ret i32 %tmp2
+}
diff --git a/test/CodeGen/AArch64/arm64-reg-copy-noneon.ll b/test/CodeGen/AArch64/arm64-reg-copy-noneon.ll
new file mode 100644
index 0000000..29255ef
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-reg-copy-noneon.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=-neon < %s | FileCheck %s
+
+define float @copy_FPR32(float %a, float %b) {
+;CHECK-LABEL: copy_FPR32:
+;CHECK: fmov s0, s1
+ ret float %b;
+}
+
+define double @copy_FPR64(double %a, double %b) {
+;CHECK-LABEL: copy_FPR64:
+;CHECK: fmov d0, d1
+ ret double %b;
+}
+
+define fp128 @copy_FPR128(fp128 %a, fp128 %b) {
+;CHECK-LABEL: copy_FPR128:
+;CHECK: str q1, [sp, #-16]!
+;CHECK-NEXT: ldr q0, [sp, #16]!
+ ret fp128 %b;
+}
diff --git a/test/CodeGen/AArch64/arm64-register-offset-addressing.ll b/test/CodeGen/AArch64/arm64-register-offset-addressing.ll
new file mode 100644
index 0000000..045712b
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-register-offset-addressing.ll
@@ -0,0 +1,145 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+
+define i8 @test_64bit_add(i16* %a, i64 %b) {
+; CHECK-LABEL: test_64bit_add:
+; CHECK: lsl [[REG:x[0-9]+]], x1, #1
+; CHECK: ldrb w0, [x0, [[REG]]]
+; CHECK: ret
+ %tmp1 = getelementptr inbounds i16* %a, i64 %b
+ %tmp2 = load i16* %tmp1
+ %tmp3 = trunc i16 %tmp2 to i8
+ ret i8 %tmp3
+}
+
+; These tests are trying to form SEXT and ZEXT operations that never leave i64
+; space, to make sure LLVM can adapt the offset register correctly.
+define void @ldst_8bit(i8* %base, i64 %offset) minsize {
+; CHECK-LABEL: ldst_8bit:
+
+ %off32.sext.tmp = shl i64 %offset, 32
+ %off32.sext = ashr i64 %off32.sext.tmp, 32
+ %addr8_sxtw = getelementptr i8* %base, i64 %off32.sext
+ %val8_sxtw = load volatile i8* %addr8_sxtw
+ %val32_signed = sext i8 %val8_sxtw to i32
+ store volatile i32 %val32_signed, i32* @var_32bit
+; CHECK: ldrsb {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+
+ %addrint_uxtw = ptrtoint i8* %base to i64
+ %offset_uxtw = and i64 %offset, 4294967295
+ %addrint1_uxtw = add i64 %addrint_uxtw, %offset_uxtw
+ %addr_uxtw = inttoptr i64 %addrint1_uxtw to i8*
+ %val8_uxtw = load volatile i8* %addr_uxtw
+ %newval8 = add i8 %val8_uxtw, 1
+ store volatile i8 %newval8, i8* @var_8bit
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+
+ ret void
+}
+
+
+define void @ldst_16bit(i16* %base, i64 %offset) minsize {
+; CHECK-LABEL: ldst_16bit:
+
+ %addrint_uxtw = ptrtoint i16* %base to i64
+ %offset_uxtw = and i64 %offset, 4294967295
+ %addrint1_uxtw = add i64 %addrint_uxtw, %offset_uxtw
+ %addr_uxtw = inttoptr i64 %addrint1_uxtw to i16*
+ %val8_uxtw = load volatile i16* %addr_uxtw
+ %newval8 = add i16 %val8_uxtw, 1
+ store volatile i16 %newval8, i16* @var_16bit
+; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+
+ %base_sxtw = ptrtoint i16* %base to i64
+ %offset_sxtw.tmp = shl i64 %offset, 32
+ %offset_sxtw = ashr i64 %offset_sxtw.tmp, 32
+ %addrint_sxtw = add i64 %base_sxtw, %offset_sxtw
+ %addr_sxtw = inttoptr i64 %addrint_sxtw to i16*
+ %val16_sxtw = load volatile i16* %addr_sxtw
+ %val64_signed = sext i16 %val16_sxtw to i64
+ store volatile i64 %val64_signed, i64* @var_64bit
+; CHECK: ldrsh {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+
+
+ %base_uxtwN = ptrtoint i16* %base to i64
+ %offset_uxtwN = and i64 %offset, 4294967295
+ %offset2_uxtwN = shl i64 %offset_uxtwN, 1
+ %addrint_uxtwN = add i64 %base_uxtwN, %offset2_uxtwN
+ %addr_uxtwN = inttoptr i64 %addrint_uxtwN to i16*
+ %val32 = load volatile i32* @var_32bit
+ %val16_trunc32 = trunc i32 %val32 to i16
+ store volatile i16 %val16_trunc32, i16* %addr_uxtwN
+; CHECK: strh {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #1]
+ ret void
+}
+
+define void @ldst_32bit(i32* %base, i64 %offset) minsize {
+; CHECK-LABEL: ldst_32bit:
+
+ %addrint_uxtw = ptrtoint i32* %base to i64
+ %offset_uxtw = and i64 %offset, 4294967295
+ %addrint1_uxtw = add i64 %addrint_uxtw, %offset_uxtw
+ %addr_uxtw = inttoptr i64 %addrint1_uxtw to i32*
+ %val32_uxtw = load volatile i32* %addr_uxtw
+ %newval32 = add i32 %val32_uxtw, 1
+ store volatile i32 %newval32, i32* @var_32bit
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+
+ %base_sxtw = ptrtoint i32* %base to i64
+ %offset_sxtw.tmp = shl i64 %offset, 32
+ %offset_sxtw = ashr i64 %offset_sxtw.tmp, 32
+ %addrint_sxtw = add i64 %base_sxtw, %offset_sxtw
+ %addr_sxtw = inttoptr i64 %addrint_sxtw to i32*
+ %val32_sxtw = load volatile i32* %addr_sxtw
+ %val64_signed = sext i32 %val32_sxtw to i64
+ store volatile i64 %val64_signed, i64* @var_64bit
+; CHECK: ldrsw {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+
+
+ %base_uxtwN = ptrtoint i32* %base to i64
+ %offset_uxtwN = and i64 %offset, 4294967295
+ %offset2_uxtwN = shl i64 %offset_uxtwN, 2
+ %addrint_uxtwN = add i64 %base_uxtwN, %offset2_uxtwN
+ %addr_uxtwN = inttoptr i64 %addrint_uxtwN to i32*
+ %val32 = load volatile i32* @var_32bit
+ store volatile i32 %val32, i32* %addr_uxtwN
+; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #2]
+ ret void
+}
+
+define void @ldst_64bit(i64* %base, i64 %offset) minsize {
+; CHECK-LABEL: ldst_64bit:
+
+ %addrint_uxtw = ptrtoint i64* %base to i64
+ %offset_uxtw = and i64 %offset, 4294967295
+ %addrint1_uxtw = add i64 %addrint_uxtw, %offset_uxtw
+ %addr_uxtw = inttoptr i64 %addrint1_uxtw to i64*
+ %val64_uxtw = load volatile i64* %addr_uxtw
+ %newval8 = add i64 %val64_uxtw, 1
+ store volatile i64 %newval8, i64* @var_64bit
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+
+ %base_sxtw = ptrtoint i64* %base to i64
+ %offset_sxtw.tmp = shl i64 %offset, 32
+ %offset_sxtw = ashr i64 %offset_sxtw.tmp, 32
+ %addrint_sxtw = add i64 %base_sxtw, %offset_sxtw
+ %addr_sxtw = inttoptr i64 %addrint_sxtw to i64*
+ %val64_sxtw = load volatile i64* %addr_sxtw
+ store volatile i64 %val64_sxtw, i64* @var_64bit
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+
+
+ %base_uxtwN = ptrtoint i64* %base to i64
+ %offset_uxtwN = and i64 %offset, 4294967295
+ %offset2_uxtwN = shl i64 %offset_uxtwN, 3
+ %addrint_uxtwN = add i64 %base_uxtwN, %offset2_uxtwN
+ %addr_uxtwN = inttoptr i64 %addrint_uxtwN to i64*
+ %val64 = load volatile i64* @var_64bit
+ store volatile i64 %val64, i64* %addr_uxtwN
+; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #3]
+ ret void
+}
+
+@var_8bit = global i8 0
+@var_16bit = global i16 0
+@var_32bit = global i32 0
+@var_64bit = global i64 0
diff --git a/test/CodeGen/AArch64/arm64-register-pairing.ll b/test/CodeGen/AArch64/arm64-register-pairing.ll
new file mode 100644
index 0000000..99defb1
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-register-pairing.ll
@@ -0,0 +1,53 @@
+; RUN: llc -mtriple=arm64-apple-ios < %s | FileCheck %s
+;
+; rdar://14075006
+
+define void @odd() nounwind {
+; CHECK-LABEL: odd:
+; CHECK: stp d15, d14, [sp, #-144]!
+; CHECK: stp d13, d12, [sp, #16]
+; CHECK: stp d11, d10, [sp, #32]
+; CHECK: stp d9, d8, [sp, #48]
+; CHECK: stp x28, x27, [sp, #64]
+; CHECK: stp x26, x25, [sp, #80]
+; CHECK: stp x24, x23, [sp, #96]
+; CHECK: stp x22, x21, [sp, #112]
+; CHECK: stp x20, x19, [sp, #128]
+; CHECK: movz x0, #0x2a
+; CHECK: ldp x20, x19, [sp, #128]
+; CHECK: ldp x22, x21, [sp, #112]
+; CHECK: ldp x24, x23, [sp, #96]
+; CHECK: ldp x26, x25, [sp, #80]
+; CHECK: ldp x28, x27, [sp, #64]
+; CHECK: ldp d9, d8, [sp, #48]
+; CHECK: ldp d11, d10, [sp, #32]
+; CHECK: ldp d13, d12, [sp, #16]
+; CHECK: ldp d15, d14, [sp], #144
+ call void asm sideeffect "mov x0, #42", "~{x0},~{x19},~{x21},~{x23},~{x25},~{x27},~{d8},~{d10},~{d12},~{d14}"() nounwind
+ ret void
+}
+
+define void @even() nounwind {
+; CHECK-LABEL: even:
+; CHECK: stp d15, d14, [sp, #-144]!
+; CHECK: stp d13, d12, [sp, #16]
+; CHECK: stp d11, d10, [sp, #32]
+; CHECK: stp d9, d8, [sp, #48]
+; CHECK: stp x28, x27, [sp, #64]
+; CHECK: stp x26, x25, [sp, #80]
+; CHECK: stp x24, x23, [sp, #96]
+; CHECK: stp x22, x21, [sp, #112]
+; CHECK: stp x20, x19, [sp, #128]
+; CHECK: movz x0, #0x2a
+; CHECK: ldp x20, x19, [sp, #128]
+; CHECK: ldp x22, x21, [sp, #112]
+; CHECK: ldp x24, x23, [sp, #96]
+; CHECK: ldp x26, x25, [sp, #80]
+; CHECK: ldp x28, x27, [sp, #64]
+; CHECK: ldp d9, d8, [sp, #48]
+; CHECK: ldp d11, d10, [sp, #32]
+; CHECK: ldp d13, d12, [sp, #16]
+; CHECK: ldp d15, d14, [sp], #144
+ call void asm sideeffect "mov x0, #42", "~{x0},~{x20},~{x22},~{x24},~{x26},~{x28},~{d9},~{d11},~{d13},~{d15}"() nounwind
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-regress-f128csel-flags.ll b/test/CodeGen/AArch64/arm64-regress-f128csel-flags.ll
new file mode 100644
index 0000000..a1daf03
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-regress-f128csel-flags.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=arm64 -verify-machineinstrs < %s | FileCheck %s
+
+; We used to not mark NZCV as being used in the continuation basic-block
+; when lowering a 128-bit "select" to branches. This meant a subsequent use
+; of the same flags gave an internal fault here.
+
+declare void @foo(fp128)
+
+define double @test_f128csel_flags(i32 %lhs, fp128 %a, fp128 %b, double %l, double %r) nounwind {
+; CHECK: test_f128csel_flags
+
+ %tst = icmp ne i32 %lhs, 42
+ %val = select i1 %tst, fp128 %a, fp128 %b
+; CHECK: cmp w0, #42
+; CHECK: b.eq {{.?LBB0}}
+
+ call void @foo(fp128 %val)
+ %retval = select i1 %tst, double %l, double %r
+
+ ; It's also reasonably important that the actual fcsel comes before the
+ ; function call since bl may corrupt NZCV. We were doing the right thing anyway,
+ ; but just as well test it while we're here.
+; CHECK: fcsel {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, ne
+; CHECK: bl {{_?foo}}
+
+ ret double %retval
+}
diff --git a/test/CodeGen/AArch64/arm64-regress-interphase-shift.ll b/test/CodeGen/AArch64/arm64-regress-interphase-shift.ll
new file mode 100644
index 0000000..fec8933
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-regress-interphase-shift.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=arm64 -o - %s | FileCheck %s
+
+; This is mostly a "don't assert" test. The type of the RHS of a shift depended
+; on the phase of legalization, which led to the creation of an unexpected and
+; unselectable "rotr" node: (i32 (rotr i32, i64)).
+
+; FIXME: This test is xfailed because it relies on an optimization that has
+; been reverted (see PR17975).
+; XFAIL: *
+
+define void @foo(i64* nocapture %d) {
+; CHECK-LABEL: foo:
+; CHECK: rorv
+ %tmp = load i64* undef, align 8
+ %sub397 = sub i64 0, %tmp
+ %and398 = and i64 %sub397, 4294967295
+ %shr404 = lshr i64 %and398, 0
+ %or405 = or i64 0, %shr404
+ %xor406 = xor i64 %or405, 0
+ %xor417 = xor i64 0, %xor406
+ %xor428 = xor i64 0, %xor417
+ %sub430 = sub i64 %xor417, 0
+ %and431 = and i64 %sub430, 4294967295
+ %and432 = and i64 %xor428, 31
+ %sub433 = sub i64 32, %and432
+ %shl434 = shl i64 %and431, %sub433
+ %shr437 = lshr i64 %and431, %and432
+ %or438 = or i64 %shl434, %shr437
+ %xor439 = xor i64 %or438, %xor428
+ %sub441 = sub i64 %xor439, 0
+ store i64 %sub441, i64* %d, align 8
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-return-vector.ll b/test/CodeGen/AArch64/arm64-return-vector.ll
new file mode 100644
index 0000000..9457d8b
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-return-vector.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+; 2x64 vector should be returned in Q0.
+
+define <2 x double> @test(<2 x double>* %p) nounwind {
+; CHECK: test
+; CHECK: ldr q0, [x0]
+; CHECK: ret
+ %tmp1 = load <2 x double>* %p, align 16
+ ret <2 x double> %tmp1
+}
diff --git a/test/CodeGen/AArch64/arm64-returnaddr.ll b/test/CodeGen/AArch64/arm64-returnaddr.ll
new file mode 100644
index 0000000..285b295
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-returnaddr.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define i8* @rt0(i32 %x) nounwind readnone {
+entry:
+; CHECK-LABEL: rt0:
+; CHECK: mov x0, x30
+; CHECK: ret
+ %0 = tail call i8* @llvm.returnaddress(i32 0)
+ ret i8* %0
+}
+
+define i8* @rt2() nounwind readnone {
+entry:
+; CHECK-LABEL: rt2:
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
+; CHECK: ldr x[[REG:[0-9]+]], [x29]
+; CHECK: ldr x[[REG2:[0-9]+]], [x[[REG]]]
+; CHECK: ldr x0, [x[[REG2]], #8]
+; CHECK: ldp x29, x30, [sp], #16
+; CHECK: ret
+ %0 = tail call i8* @llvm.returnaddress(i32 2)
+ ret i8* %0
+}
+
+declare i8* @llvm.returnaddress(i32) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-rev.ll b/test/CodeGen/AArch64/arm64-rev.ll
new file mode 100644
index 0000000..30d9f4f
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-rev.ll
@@ -0,0 +1,235 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define i32 @test_rev_w(i32 %a) nounwind {
+entry:
+; CHECK-LABEL: test_rev_w:
+; CHECK: rev w0, w0
+ %0 = tail call i32 @llvm.bswap.i32(i32 %a)
+ ret i32 %0
+}
+
+define i64 @test_rev_x(i64 %a) nounwind {
+entry:
+; CHECK-LABEL: test_rev_x:
+; CHECK: rev x0, x0
+ %0 = tail call i64 @llvm.bswap.i64(i64 %a)
+ ret i64 %0
+}
+
+declare i32 @llvm.bswap.i32(i32) nounwind readnone
+declare i64 @llvm.bswap.i64(i64) nounwind readnone
+
+define i32 @test_rev16_w(i32 %X) nounwind {
+entry:
+; CHECK-LABEL: test_rev16_w:
+; CHECK: rev16 w0, w0
+ %tmp1 = lshr i32 %X, 8
+ %X15 = bitcast i32 %X to i32
+ %tmp4 = shl i32 %X15, 8
+ %tmp2 = and i32 %tmp1, 16711680
+ %tmp5 = and i32 %tmp4, -16777216
+ %tmp9 = and i32 %tmp1, 255
+ %tmp13 = and i32 %tmp4, 65280
+ %tmp6 = or i32 %tmp5, %tmp2
+ %tmp10 = or i32 %tmp6, %tmp13
+ %tmp14 = or i32 %tmp10, %tmp9
+ ret i32 %tmp14
+}
+
+; 64-bit REV16 is *not* a swap then a 16-bit rotation:
+; 01234567 ->(bswap) 76543210 ->(rotr) 10765432
+; 01234567 ->(rev16) 10325476
+define i64 @test_rev16_x(i64 %a) nounwind {
+entry:
+; CHECK-LABEL: test_rev16_x:
+; CHECK-NOT: rev16 x0, x0
+ %0 = tail call i64 @llvm.bswap.i64(i64 %a)
+ %1 = lshr i64 %0, 16
+ %2 = shl i64 %0, 48
+ %3 = or i64 %1, %2
+ ret i64 %3
+}
+
+define i64 @test_rev32_x(i64 %a) nounwind {
+entry:
+; CHECK-LABEL: test_rev32_x:
+; CHECK: rev32 x0, x0
+ %0 = tail call i64 @llvm.bswap.i64(i64 %a)
+ %1 = lshr i64 %0, 32
+ %2 = shl i64 %0, 32
+ %3 = or i64 %1, %2
+ ret i64 %3
+}
+
+define <8 x i8> @test_vrev64D8(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: test_vrev64D8:
+;CHECK: rev64.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @test_vrev64D16(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: test_vrev64D16:
+;CHECK: rev64.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x i16> %tmp2
+}
+
+define <2 x i32> @test_vrev64D32(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: test_vrev64D32:
+;CHECK: rev64.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+ ret <2 x i32> %tmp2
+}
+
+define <2 x float> @test_vrev64Df(<2 x float>* %A) nounwind {
+;CHECK-LABEL: test_vrev64Df:
+;CHECK: rev64.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+ ret <2 x float> %tmp2
+}
+
+define <16 x i8> @test_vrev64Q8(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: test_vrev64Q8:
+;CHECK: rev64.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @test_vrev64Q16(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: test_vrev64Q16:
+;CHECK: rev64.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @test_vrev64Q32(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: test_vrev64Q32:
+;CHECK: rev64.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ ret <4 x i32> %tmp2
+}
+
+define <4 x float> @test_vrev64Qf(<4 x float>* %A) nounwind {
+;CHECK-LABEL: test_vrev64Qf:
+;CHECK: rev64.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ ret <4 x float> %tmp2
+}
+
+define <8 x i8> @test_vrev32D8(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: test_vrev32D8:
+;CHECK: rev32.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x i8> %tmp2
+}
+
+define <4 x i16> @test_vrev32D16(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: test_vrev32D16:
+;CHECK: rev32.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ ret <4 x i16> %tmp2
+}
+
+define <16 x i8> @test_vrev32Q8(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: test_vrev32Q8:
+;CHECK: rev32.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @test_vrev32Q16(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: test_vrev32Q16:
+;CHECK: rev32.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+ ret <8 x i16> %tmp2
+}
+
+define <8 x i8> @test_vrev16D8(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: test_vrev16D8:
+;CHECK: rev16.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+ ret <8 x i8> %tmp2
+}
+
+define <16 x i8> @test_vrev16Q8(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: test_vrev16Q8:
+;CHECK: rev16.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+ ret <16 x i8> %tmp2
+}
+
+; Undef shuffle indices should not prevent matching to VREV:
+
+define <8 x i8> @test_vrev64D8_undef(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: test_vrev64D8_undef:
+;CHECK: rev64.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 undef, i32 undef, i32 4, i32 3, i32 2, i32 1, i32 0>
+ ret <8 x i8> %tmp2
+}
+
+define <8 x i16> @test_vrev32Q16_undef(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: test_vrev32Q16_undef:
+;CHECK: rev32.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 5, i32 4, i32 7, i32 undef>
+ ret <8 x i16> %tmp2
+}
+
+; vrev <4 x i16> should use REV32 and not REV64
+define void @test_vrev64(<4 x i16>* nocapture %source, <2 x i16>* nocapture %dst) nounwind ssp {
+; CHECK-LABEL: test_vrev64:
+; CHECK: ldr [[DEST:q[0-9]+]],
+; CHECK: st1.h
+; CHECK: st1.h
+entry:
+ %0 = bitcast <4 x i16>* %source to <8 x i16>*
+ %tmp2 = load <8 x i16>* %0, align 4
+ %tmp3 = extractelement <8 x i16> %tmp2, i32 6
+ %tmp5 = insertelement <2 x i16> undef, i16 %tmp3, i32 0
+ %tmp9 = extractelement <8 x i16> %tmp2, i32 5
+ %tmp11 = insertelement <2 x i16> %tmp5, i16 %tmp9, i32 1
+ store <2 x i16> %tmp11, <2 x i16>* %dst, align 4
+ ret void
+}
+
+; Test vrev of float4
+define void @float_vrev64(float* nocapture %source, <4 x float>* nocapture %dest) nounwind noinline ssp {
+; CHECK-LABEL: float_vrev64:
+; CHECK: ldr [[DEST:q[0-9]+]],
+; CHECK: rev64.4s
+entry:
+ %0 = bitcast float* %source to <4 x float>*
+ %tmp2 = load <4 x float>* %0, align 4
+ %tmp5 = shufflevector <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x float> %tmp2, <4 x i32> <i32 0, i32 7, i32 0, i32 0>
+ %arrayidx8 = getelementptr inbounds <4 x float>* %dest, i32 11
+ store <4 x float> %tmp5, <4 x float>* %arrayidx8, align 4
+ ret void
+}
+
+
+define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind {
+; CHECK-LABEL: test_vrev32_bswap:
+; CHECK: rev32.16b
+; CHECK-NOT: rev
+; CHECK: ret
+ %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source)
+ ret <4 x i32> %bswap
+}
+
+declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-rounding.ll b/test/CodeGen/AArch64/arm64-rounding.ll
new file mode 100644
index 0000000..9311144
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-rounding.ll
@@ -0,0 +1,208 @@
+; RUN: llc -O3 < %s -mcpu=cyclone | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64"
+target triple = "arm64-apple-ios6.0.0"
+
+; CHECK: test1
+; CHECK: frintx
+; CHECK: frintm
+define float @test1(float %a) #0 {
+entry:
+ %call = tail call float @floorf(float %a) nounwind readnone
+ ret float %call
+}
+
+declare float @floorf(float) nounwind readnone
+
+; CHECK: test2
+; CHECK: frintx
+; CHECK: frintm
+define double @test2(double %a) #0 {
+entry:
+ %call = tail call double @floor(double %a) nounwind readnone
+ ret double %call
+}
+
+declare double @floor(double) nounwind readnone
+
+; CHECK: test3
+; CHECK: frinti
+define float @test3(float %a) #0 {
+entry:
+ %call = tail call float @nearbyintf(float %a) nounwind readnone
+ ret float %call
+}
+
+declare float @nearbyintf(float) nounwind readnone
+
+; CHECK: test4
+; CHECK: frinti
+define double @test4(double %a) #0 {
+entry:
+ %call = tail call double @nearbyint(double %a) nounwind readnone
+ ret double %call
+}
+
+declare double @nearbyint(double) nounwind readnone
+
+; CHECK: test5
+; CHECK: frintx
+; CHECK: frintp
+define float @test5(float %a) #0 {
+entry:
+ %call = tail call float @ceilf(float %a) nounwind readnone
+ ret float %call
+}
+
+declare float @ceilf(float) nounwind readnone
+
+; CHECK: test6
+; CHECK: frintx
+; CHECK: frintp
+define double @test6(double %a) #0 {
+entry:
+ %call = tail call double @ceil(double %a) nounwind readnone
+ ret double %call
+}
+
+declare double @ceil(double) nounwind readnone
+
+; CHECK: test7
+; CHECK: frintx
+define float @test7(float %a) #0 {
+entry:
+ %call = tail call float @rintf(float %a) nounwind readnone
+ ret float %call
+}
+
+declare float @rintf(float) nounwind readnone
+
+; CHECK: test8
+; CHECK: frintx
+define double @test8(double %a) #0 {
+entry:
+ %call = tail call double @rint(double %a) nounwind readnone
+ ret double %call
+}
+
+declare double @rint(double) nounwind readnone
+
+; CHECK: test9
+; CHECK: frintx
+; CHECK: frintz
+define float @test9(float %a) #0 {
+entry:
+ %call = tail call float @truncf(float %a) nounwind readnone
+ ret float %call
+}
+
+declare float @truncf(float) nounwind readnone
+
+; CHECK: test10
+; CHECK: frintx
+; CHECK: frintz
+define double @test10(double %a) #0 {
+entry:
+ %call = tail call double @trunc(double %a) nounwind readnone
+ ret double %call
+}
+
+declare double @trunc(double) nounwind readnone
+
+; CHECK: test11
+; CHECK: frintx
+; CHECK: frinta
+define float @test11(float %a) #0 {
+entry:
+ %call = tail call float @roundf(float %a) nounwind readnone
+ ret float %call
+}
+
+declare float @roundf(float %a) nounwind readnone
+
+; CHECK: test12
+; CHECK: frintx
+; CHECK: frinta
+define double @test12(double %a) #0 {
+entry:
+ %call = tail call double @round(double %a) nounwind readnone
+ ret double %call
+}
+
+declare double @round(double %a) nounwind readnone
+
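+; Tests 13-20 repeat the same rounding calls but with attribute #1
+; ("unsafe-fp-math"="true"). The leading frintx in tests 1-12 appears to be
+; there only for its side effect of raising the inexact exception that the
+; libm semantics may require; with unsafe-fp-math that side effect can be
+; dropped, so only the final rounding instruction is expected.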
+; CHECK: test13
+; CHECK-NOT: frintx
+; CHECK: frintm
+define float @test13(float %a) #1 {
+entry:
+ %call = tail call float @floorf(float %a) nounwind readnone
+ ret float %call
+}
+
+; CHECK: test14
+; CHECK-NOT: frintx
+; CHECK: frintm
+define double @test14(double %a) #1 {
+entry:
+ %call = tail call double @floor(double %a) nounwind readnone
+ ret double %call
+}
+
+; CHECK: test15
+; CHECK-NOT: frintx
+; CHECK: frintp
+define float @test15(float %a) #1 {
+entry:
+ %call = tail call float @ceilf(float %a) nounwind readnone
+ ret float %call
+}
+
+; CHECK: test16
+; CHECK-NOT: frintx
+; CHECK: frintp
+define double @test16(double %a) #1 {
+entry:
+ %call = tail call double @ceil(double %a) nounwind readnone
+ ret double %call
+}
+
+; CHECK: test17
+; CHECK-NOT: frintx
+; CHECK: frintz
+define float @test17(float %a) #1 {
+entry:
+ %call = tail call float @truncf(float %a) nounwind readnone
+ ret float %call
+}
+
+; CHECK: test18
+; CHECK-NOT: frintx
+; CHECK: frintz
+define double @test18(double %a) #1 {
+entry:
+ %call = tail call double @trunc(double %a) nounwind readnone
+ ret double %call
+}
+
+; CHECK: test19
+; CHECK-NOT: frintx
+; CHECK: frinta
+define float @test19(float %a) #1 {
+entry:
+ %call = tail call float @roundf(float %a) nounwind readnone
+ ret float %call
+}
+
+; CHECK: test20
+; CHECK-NOT: frintx
+; CHECK: frinta
+define double @test20(double %a) #1 {
+entry:
+ %call = tail call double @round(double %a) nounwind readnone
+ ret double %call
+}
+
+
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/AArch64/arm64-scaled_iv.ll b/test/CodeGen/AArch64/arm64-scaled_iv.ll
new file mode 100644
index 0000000..987373e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-scaled_iv.ll
@@ -0,0 +1,38 @@
+; RUN: opt -S -loop-reduce < %s | FileCheck %s
+; Scaling factors in addressing modes are costly.
+; Make loop-reduce prefer unscaled accesses.
+; <rdar://problem/13806271>
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "arm64-apple-ios7.0.0"
+
+; Function Attrs: nounwind ssp
+define void @mulDouble(double* nocapture %a, double* nocapture %b, double* nocapture %c) {
+; CHECK: @mulDouble
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+; CHECK: [[IV:%[^ ]+]] = phi i64 [ [[IVNEXT:%[^,]+]], %for.body ], [ 0, %entry ]
+; Only one induction variable should have been generated.
+; CHECK-NOT: phi
+ %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = add nsw i64 %indvars.iv, -1
+ %arrayidx = getelementptr inbounds double* %b, i64 %tmp
+ %tmp1 = load double* %arrayidx, align 8
+; The induction variable should carry the scaling factor: 1 * 8 = 8.
+; CHECK: [[IVNEXT]] = add nuw nsw i64 [[IV]], 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %arrayidx2 = getelementptr inbounds double* %c, i64 %indvars.iv.next
+ %tmp2 = load double* %arrayidx2, align 8
+ %mul = fmul double %tmp1, %tmp2
+ %arrayidx4 = getelementptr inbounds double* %a, i64 %indvars.iv
+ store double %mul, double* %arrayidx4, align 8
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+; Comparison should be 19 * 8 = 152.
+; CHECK: icmp eq i32 {{%[^,]+}}, 152
+ %exitcond = icmp eq i32 %lftr.wideiv, 20
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-scvt.ll b/test/CodeGen/AArch64/arm64-scvt.ll
new file mode 100644
index 0000000..2e006cf
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-scvt.ll
@@ -0,0 +1,830 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+; rdar://13082402
+
+define float @t1(i32* nocapture %src) nounwind ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: ldr s0, [x0]
+; CHECK: scvtf s0, s0
+ %tmp1 = load i32* %src, align 4
+ %tmp2 = sitofp i32 %tmp1 to float
+ ret float %tmp2
+}
+
+define float @t2(i32* nocapture %src) nounwind ssp {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: ldr s0, [x0]
+; CHECK: ucvtf s0, s0
+ %tmp1 = load i32* %src, align 4
+ %tmp2 = uitofp i32 %tmp1 to float
+ ret float %tmp2
+}
+
+define double @t3(i64* nocapture %src) nounwind ssp {
+entry:
+; CHECK-LABEL: t3:
+; CHECK: ldr d0, [x0]
+; CHECK: scvtf d0, d0
+ %tmp1 = load i64* %src, align 4
+ %tmp2 = sitofp i64 %tmp1 to double
+ ret double %tmp2
+}
+
+define double @t4(i64* nocapture %src) nounwind ssp {
+entry:
+; CHECK-LABEL: t4:
+; CHECK: ldr d0, [x0]
+; CHECK: ucvtf d0, d0
+ %tmp1 = load i64* %src, align 4
+ %tmp2 = uitofp i64 %tmp1 to double
+ ret double %tmp2
+}
+
+; rdar://13136456
+define double @t5(i32* nocapture %src) nounwind ssp optsize {
+entry:
+; CHECK-LABEL: t5:
+; CHECK: ldr [[REG:w[0-9]+]], [x0]
+; CHECK: scvtf d0, [[REG]]
+ %tmp1 = load i32* %src, align 4
+ %tmp2 = sitofp i32 %tmp1 to double
+ ret double %tmp2
+}
+
+; Check that we load into an FP register when we want to convert to a
+; floating-point value.
+; This is much faster than loading into a GPR and then doing the
+; GPR -> FPR conversion.
+; <rdar://problem/14599607>
+;
+; Check the following patterns for signed/unsigned:
+; 1. load with scaled imm to float.
+; 2. load with scaled register to float.
+; 3. load with scaled imm to double.
+; 4. load with scaled register to double.
+; 5. load with unscaled imm to float.
+; 6. load with unscaled imm to double.
+; With load sizes of 8, 16, 32, and 64 bits; a rough sketch of the expected code follows.
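+; For instance, for an unsigned i16 -> float with a scaled immediate, the
+; expected shape is roughly (register numbers here are illustrative only;
+; the real checks below use FileCheck variables):
+;   ldr   h0, [x0, #2]
+;   ucvtf s0, s0
+; rather than a w-register load followed by a GPR -> FPR ucvtf.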
+
+; ********* 1. load with scaled imm to float. *********
+define float @fct1(i8* nocapture %sp0) {
+; CHECK-LABEL: fct1:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 1
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = uitofp i8 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @fct2(i16* nocapture %sp0) {
+; CHECK-LABEL: fct2:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, #2]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 1
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = uitofp i16 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @fct3(i32* nocapture %sp0) {
+; CHECK-LABEL: fct3:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, #4]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 1
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = uitofp i32 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+; i64 -> f32 is not supported on the floating-point unit.
+define float @fct4(i64* nocapture %sp0) {
+; CHECK-LABEL: fct4:
+; CHECK: ldr x[[REGNUM:[0-9]+]], [x0, #8]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], x[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i64* %sp0, i64 1
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = uitofp i64 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+; ********* 2. load with scaled register to float. *********
+define float @fct5(i8* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct5:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, x1]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = uitofp i8 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @fct6(i16* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct6:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, x1, lsl #1]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = uitofp i16 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @fct7(i32* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct7:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, x1, lsl #2]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = uitofp i32 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+; i64 -> f32 is not supported on the floating-point unit.
+define float @fct8(i64* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct8:
+; CHECK: ldr x[[REGNUM:[0-9]+]], [x0, x1, lsl #3]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], x[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i64* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = uitofp i64 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+
+; ********* 3. load with scaled imm to double. *********
+define double @fct9(i8* nocapture %sp0) {
+; CHECK-LABEL: fct9:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 1
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = uitofp i8 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @fct10(i16* nocapture %sp0) {
+; CHECK-LABEL: fct10:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, #2]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 1
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = uitofp i16 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @fct11(i32* nocapture %sp0) {
+; CHECK-LABEL: fct11:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, #4]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 1
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = uitofp i32 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @fct12(i64* nocapture %sp0) {
+; CHECK-LABEL: fct12:
+; CHECK: ldr d[[REGNUM:[0-9]+]], [x0, #8]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i64* %sp0, i64 1
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = uitofp i64 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+; ********* 4. load with scaled register to double. *********
+define double @fct13(i8* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct13:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, x1]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = uitofp i8 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @fct14(i16* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct14:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, x1, lsl #1]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = uitofp i16 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @fct15(i32* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct15:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, x1, lsl #2]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = uitofp i32 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @fct16(i64* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct16:
+; CHECK: ldr d[[REGNUM:[0-9]+]], [x0, x1, lsl #3]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i64* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = uitofp i64 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+; ********* 5. load with unscaled imm to float. *********
+define float @fct17(i8* nocapture %sp0) {
+entry:
+; CHECK-LABEL: fct17:
+; CHECK: ldur b[[REGNUM:[0-9]+]], [x0, #-1]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i8* %sp0 to i64
+ %add = add i64 %bitcast, -1
+ %addr = inttoptr i64 %add to i8*
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = uitofp i8 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @fct18(i16* nocapture %sp0) {
+; CHECK-LABEL: fct18:
+; CHECK: ldur h[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i16* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i16*
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = uitofp i16 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @fct19(i32* nocapture %sp0) {
+; CHECK-LABEL: fct19:
+; CHECK: ldur s[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], s[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i32* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i32*
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = uitofp i32 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+; i64 -> f32 is not supported on the floating-point unit.
+define float @fct20(i64* nocapture %sp0) {
+; CHECK-LABEL: fct20:
+; CHECK: ldur x[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:s[0-9]+]], x[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i64* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i64*
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = uitofp i64 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+
+}
+
+; ********* 6. load with unscaled imm to double. *********
+define double @fct21(i8* nocapture %sp0) {
+entry:
+; CHECK-LABEL: fct21:
+; CHECK: ldur b[[REGNUM:[0-9]+]], [x0, #-1]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i8* %sp0 to i64
+ %add = add i64 %bitcast, -1
+ %addr = inttoptr i64 %add to i8*
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = uitofp i8 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @fct22(i16* nocapture %sp0) {
+; CHECK-LABEL: fct22:
+; CHECK: ldur h[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i16* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i16*
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = uitofp i16 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @fct23(i32* nocapture %sp0) {
+; CHECK-LABEL: fct23:
+; CHECK: ldur s[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i32* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i32*
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = uitofp i32 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @fct24(i64* nocapture %sp0) {
+; CHECK-LABEL: fct24:
+; CHECK: ldur d[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: ucvtf [[REG:d[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i64* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i64*
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = uitofp i64 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+
+}
+
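+; The signed variants follow. Unlike the unsigned cases above, a small signed
+; value kept in an FP/SIMD register apparently needs to be widened with sshll
+; steps before scvtf can consume it, so several of the checks below expect
+; that extra sign-extension sequence (or an ldrsb plus a GPR scvtf where that
+; is the cheaper choice).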
+; ********* 1s. load with scaled imm to float. *********
+define float @sfct1(i8* nocapture %sp0) {
+; CHECK-LABEL: sfct1:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: sshll.8h [[SEXTREG1:v[0-9]+]], v[[REGNUM]], #0
+; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
+; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 1
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = sitofp i8 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @sfct2(i16* nocapture %sp0) {
+; CHECK-LABEL: sfct2:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, #2]
+; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], v[[REGNUM]], #0
+; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 1
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = sitofp i16 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @sfct3(i32* nocapture %sp0) {
+; CHECK-LABEL: sfct3:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, #4]
+; CHECK-NEXT: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 1
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = sitofp i32 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+; i64 -> f32 is not supported on the floating-point unit.
+define float @sfct4(i64* nocapture %sp0) {
+; CHECK-LABEL: sfct4:
+; CHECK: ldr x[[REGNUM:[0-9]+]], [x0, #8]
+; CHECK-NEXT: scvtf [[REG:s[0-9]+]], x[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i64* %sp0, i64 1
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = sitofp i64 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+; ********* 2s. load with scaled register to float. *********
+define float @sfct5(i8* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct5:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, x1]
+; CHECK-NEXT: sshll.8h [[SEXTREG1:v[0-9]+]], v[[REGNUM]], #0
+; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
+; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = sitofp i8 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @sfct6(i16* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct6:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, x1, lsl #1]
+; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], v[[REGNUM]], #0
+; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = sitofp i16 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @sfct7(i32* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct7:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, x1, lsl #2]
+; CHECK-NEXT: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = sitofp i32 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+; i64 -> f32 is not supported on the floating-point unit.
+define float @sfct8(i64* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct8:
+; CHECK: ldr x[[REGNUM:[0-9]+]], [x0, x1, lsl #3]
+; CHECK-NEXT: scvtf [[REG:s[0-9]+]], x[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i64* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = sitofp i64 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+; ********* 3s. load with scaled imm to double. *********
+define double @sfct9(i8* nocapture %sp0) {
+; CHECK-LABEL: sfct9:
+; CHECK: ldrsb w[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: scvtf [[REG:d[0-9]+]], w[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 1
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = sitofp i8 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @sfct10(i16* nocapture %sp0) {
+; CHECK-LABEL: sfct10:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, #2]
+; CHECK-NEXT: sshll.4s [[SEXTREG1:v[0-9]+]], v[[REGNUM]], #0
+; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
+; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 1
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = sitofp i16 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @sfct11(i32* nocapture %sp0) {
+; CHECK-LABEL: sfct11:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, #4]
+; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], v[[REGNUM]], #0
+; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 1
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = sitofp i32 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @sfct12(i64* nocapture %sp0) {
+; CHECK-LABEL: sfct12:
+; CHECK: ldr d[[REGNUM:[0-9]+]], [x0, #8]
+; CHECK-NEXT: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i64* %sp0, i64 1
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = sitofp i64 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+; ********* 4s. load with scaled register to double. *********
+define double @sfct13(i8* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct13:
+; CHECK: ldrsb w[[REGNUM:[0-9]+]], [x0, x1]
+; CHECK-NEXT: scvtf [[REG:d[0-9]+]], w[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = sitofp i8 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @sfct14(i16* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct14:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, x1, lsl #1]
+; CHECK-NEXT: sshll.4s [[SEXTREG1:v[0-9]+]], v[[REGNUM]], #0
+; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
+; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = sitofp i16 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @sfct15(i32* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct15:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, x1, lsl #2]
+; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], v[[REGNUM]], #0
+; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = sitofp i32 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @sfct16(i64* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: sfct16:
+; CHECK: ldr d[[REGNUM:[0-9]+]], [x0, x1, lsl #3]
+; CHECK-NEXT: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i64* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = sitofp i64 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+; ********* 5s. load with unscaled imm to float. *********
+define float @sfct17(i8* nocapture %sp0) {
+entry:
+; CHECK-LABEL: sfct17:
+; CHECK: ldur b[[REGNUM:[0-9]+]], [x0, #-1]
+; CHECK-NEXT: sshll.8h [[SEXTREG1:v[0-9]+]], v[[REGNUM]], #0
+; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
+; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i8* %sp0 to i64
+ %add = add i64 %bitcast, -1
+ %addr = inttoptr i64 %add to i8*
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = sitofp i8 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @sfct18(i16* nocapture %sp0) {
+; CHECK-LABEL: sfct18:
+; CHECK: ldur h[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: sshll.4s v[[SEXTREG:[0-9]+]], v[[REGNUM]], #0
+; CHECK: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i16* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i16*
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = sitofp i16 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define float @sfct19(i32* nocapture %sp0) {
+; CHECK-LABEL: sfct19:
+; CHECK: ldur s[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: scvtf [[REG:s[0-9]+]], s[[SEXTREG]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i32* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i32*
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = sitofp i32 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+; i64 -> f32 is not supported on the floating-point unit.
+define float @sfct20(i64* nocapture %sp0) {
+; CHECK-LABEL: sfct20:
+; CHECK: ldur x[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: scvtf [[REG:s[0-9]+]], x[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i64* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i64*
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = sitofp i64 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+
+}
+
+; ********* 6s. load with unscaled imm to double. *********
+define double @sfct21(i8* nocapture %sp0) {
+entry:
+; CHECK-LABEL: sfct21:
+; CHECK: ldursb w[[REGNUM:[0-9]+]], [x0, #-1]
+; CHECK-NEXT: scvtf [[REG:d[0-9]+]], w[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i8* %sp0 to i64
+ %add = add i64 %bitcast, -1
+ %addr = inttoptr i64 %add to i8*
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = sitofp i8 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @sfct22(i16* nocapture %sp0) {
+; CHECK-LABEL: sfct22:
+; CHECK: ldur h[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: sshll.4s [[SEXTREG1:v[0-9]+]], v[[REGNUM]], #0
+; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], [[SEXTREG1]], #0
+; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i16* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i16*
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %val = sitofp i16 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @sfct23(i32* nocapture %sp0) {
+; CHECK-LABEL: sfct23:
+; CHECK: ldur s[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: sshll.2d v[[SEXTREG:[0-9]+]], v[[REGNUM]], #0
+; CHECK: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i32* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i32*
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = sitofp i32 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+define double @sfct24(i64* nocapture %sp0) {
+; CHECK-LABEL: sfct24:
+; CHECK: ldur d[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: scvtf [[REG:d[0-9]+]], d[[SEXTREG]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i64* %sp0 to i64
+ %add = add i64 %bitcast, 1
+ %addr = inttoptr i64 %add to i64*
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %val = sitofp i64 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+
+}
+
+; Check that we do not use the SSHLL code sequence when code size is a concern.
+define float @codesize_sfct17(i8* nocapture %sp0) optsize {
+entry:
+; CHECK-LABEL: codesize_sfct17:
+; CHECK: ldursb w[[REGNUM:[0-9]+]], [x0, #-1]
+; CHECK-NEXT: scvtf [[REG:s[0-9]+]], w[[REGNUM]]
+; CHECK-NEXT: fmul s0, [[REG]], [[REG]]
+ %bitcast = ptrtoint i8* %sp0 to i64
+ %add = add i64 %bitcast, -1
+ %addr = inttoptr i64 %add to i8*
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %val = sitofp i8 %pix_sp0.0.copyload to float
+ %vmull.i = fmul float %val, %val
+ ret float %vmull.i
+}
+
+define double @codesize_sfct11(i32* nocapture %sp0) minsize {
+; CHECK-LABEL: codesize_sfct11:
+; CHECK: ldr w[[REGNUM:[0-9]+]], [x0, #4]
+; CHECK-NEXT: scvtf [[REG:d[0-9]+]], w[[REGNUM]]
+; CHECK-NEXT: fmul d0, [[REG]], [[REG]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 1
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %val = sitofp i32 %pix_sp0.0.copyload to double
+ %vmull.i = fmul double %val, %val
+ ret double %vmull.i
+}
+
+; Adding fp128 custom lowering makes these a little fragile since we have to
+; return the correct mix of Legal/Expand from the custom method.
+;
+; rdar://problem/14991489
+
+define float @float_from_i128(i128 %in) {
+; CHECK-LABEL: float_from_i128:
+; CHECK: bl {{_?__floatuntisf}}
+ %conv = uitofp i128 %in to float
+ ret float %conv
+}
+
+define double @double_from_i128(i128 %in) {
+; CHECK-LABEL: double_from_i128:
+; CHECK: bl {{_?__floattidf}}
+ %conv = sitofp i128 %in to double
+ ret double %conv
+}
+
+define fp128 @fp128_from_i128(i128 %in) {
+; CHECK-LABEL: fp128_from_i128:
+; CHECK: bl {{_?__floatuntitf}}
+ %conv = uitofp i128 %in to fp128
+ ret fp128 %conv
+}
+
+define i128 @i128_from_float(float %in) {
+; CHECK-LABEL: i128_from_float:
+; CHECK: bl {{_?__fixsfti}}
+ %conv = fptosi float %in to i128
+ ret i128 %conv
+}
+
+define i128 @i128_from_double(double %in) {
+; CHECK-LABEL: i128_from_double:
+; CHECK: bl {{_?__fixunsdfti}}
+ %conv = fptoui double %in to i128
+ ret i128 %conv
+}
+
+define i128 @i128_from_fp128(fp128 %in) {
+; CHECK-LABEL: i128_from_fp128:
+; CHECK: bl {{_?__fixtfti}}
+ %conv = fptosi fp128 %in to i128
+ ret i128 %conv
+}
+
diff --git a/test/CodeGen/AArch64/arm64-shifted-sext.ll b/test/CodeGen/AArch64/arm64-shifted-sext.ll
new file mode 100644
index 0000000..b7b4e5d
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-shifted-sext.ll
@@ -0,0 +1,277 @@
+; RUN: llc -march=arm64 -mtriple=arm64-apple-ios < %s | FileCheck %s
+;
+; <rdar://problem/13820218>
+
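+; Rough expectation for the tests below: a left shift of a sign-extended
+; value should fold into a single sbfiz, a right shift into a single sbfx,
+; and when the shift amount equals the width of the source type the pattern
+; degenerates into an explicit extend (sxtb/sxth/sxtw) plus a shift, or a
+; plain lsl where the extension is provably redundant.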
+define signext i16 @extendedLeftShiftcharToshortBy4(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftcharToshortBy4:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfiz w0, [[REG]], #4, #8
+ %inc = add i8 %a, 1
+ %conv1 = sext i8 %inc to i32
+ %shl = shl nsw i32 %conv1, 4
+ %conv2 = trunc i32 %shl to i16
+ ret i16 %conv2
+}
+
+define signext i16 @extendedRightShiftcharToshortBy4(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftcharToshortBy4:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfx w0, [[REG]], #4, #4
+ %inc = add i8 %a, 1
+ %conv1 = sext i8 %inc to i32
+ %shr4 = lshr i32 %conv1, 4
+ %conv2 = trunc i32 %shr4 to i16
+ ret i16 %conv2
+}
+
+define signext i16 @extendedLeftShiftcharToshortBy8(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftcharToshortBy8:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfiz w0, [[REG]], #8, #8
+ %inc = add i8 %a, 1
+ %conv1 = sext i8 %inc to i32
+ %shl = shl nsw i32 %conv1, 8
+ %conv2 = trunc i32 %shl to i16
+ ret i16 %conv2
+}
+
+define signext i16 @extendedRightShiftcharToshortBy8(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftcharToshortBy8:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sxtb [[REG]], [[REG]]
+; CHECK: asr w0, [[REG]], #8
+ %inc = add i8 %a, 1
+ %conv1 = sext i8 %inc to i32
+ %shr4 = lshr i32 %conv1, 8
+ %conv2 = trunc i32 %shr4 to i16
+ ret i16 %conv2
+}
+
+define i32 @extendedLeftShiftcharTointBy4(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftcharTointBy4:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfiz w0, [[REG]], #4, #8
+ %inc = add i8 %a, 1
+ %conv = sext i8 %inc to i32
+ %shl = shl nsw i32 %conv, 4
+ ret i32 %shl
+}
+
+define i32 @extendedRightShiftcharTointBy4(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftcharTointBy4:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfx w0, [[REG]], #4, #4
+ %inc = add i8 %a, 1
+ %conv = sext i8 %inc to i32
+ %shr = ashr i32 %conv, 4
+ ret i32 %shr
+}
+
+define i32 @extendedLeftShiftcharTointBy8(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftcharTointBy8:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfiz w0, [[REG]], #8, #8
+ %inc = add i8 %a, 1
+ %conv = sext i8 %inc to i32
+ %shl = shl nsw i32 %conv, 8
+ ret i32 %shl
+}
+
+define i32 @extendedRightShiftcharTointBy8(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftcharTointBy8:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sxtb [[REG]], [[REG]]
+; CHECK: asr w0, [[REG]], #8
+ %inc = add i8 %a, 1
+ %conv = sext i8 %inc to i32
+ %shr = ashr i32 %conv, 8
+ ret i32 %shr
+}
+
+define i64 @extendedLeftShiftcharToint64By4(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftcharToint64By4:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfiz x0, x[[REG]], #4, #8
+ %inc = add i8 %a, 1
+ %conv = sext i8 %inc to i64
+ %shl = shl nsw i64 %conv, 4
+ ret i64 %shl
+}
+
+define i64 @extendedRightShiftcharToint64By4(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftcharToint64By4:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfx x0, x[[REG]], #4, #4
+ %inc = add i8 %a, 1
+ %conv = sext i8 %inc to i64
+ %shr = ashr i64 %conv, 4
+ ret i64 %shr
+}
+
+define i64 @extendedLeftShiftcharToint64By8(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftcharToint64By8:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfiz x0, x[[REG]], #8, #8
+ %inc = add i8 %a, 1
+ %conv = sext i8 %inc to i64
+ %shl = shl nsw i64 %conv, 8
+ ret i64 %shl
+}
+
+define i64 @extendedRightShiftcharToint64By8(i8 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftcharToint64By8:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sxtb x[[REG]], w[[REG]]
+; CHECK: asr x0, x[[REG]], #8
+ %inc = add i8 %a, 1
+ %conv = sext i8 %inc to i64
+ %shr = ashr i64 %conv, 8
+ ret i64 %shr
+}
+
+define i32 @extendedLeftShiftshortTointBy4(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftshortTointBy4:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfiz w0, [[REG]], #4, #16
+ %inc = add i16 %a, 1
+ %conv = sext i16 %inc to i32
+ %shl = shl nsw i32 %conv, 4
+ ret i32 %shl
+}
+
+define i32 @extendedRightShiftshortTointBy4(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftshortTointBy4:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sbfx w0, [[REG]], #4, #12
+ %inc = add i16 %a, 1
+ %conv = sext i16 %inc to i32
+ %shr = ashr i32 %conv, 4
+ ret i32 %shr
+}
+
+define i32 @extendedLeftShiftshortTointBy16(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftshortTointBy16:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: lsl w0, [[REG]], #16
+ %inc = add i16 %a, 1
+ %conv2 = zext i16 %inc to i32
+ %shl = shl nuw i32 %conv2, 16
+ ret i32 %shl
+}
+
+define i32 @extendedRightShiftshortTointBy16(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftshortTointBy16:
+; CHECK: add [[REG:w[0-9]+]], w0, #1
+; CHECK: sxth [[REG]], [[REG]]
+; CHECK: asr w0, [[REG]], #16
+ %inc = add i16 %a, 1
+ %conv = sext i16 %inc to i32
+ %shr = ashr i32 %conv, 16
+ ret i32 %shr
+}
+
+define i64 @extendedLeftShiftshortToint64By4(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftshortToint64By4:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfiz x0, x[[REG]], #4, #16
+ %inc = add i16 %a, 1
+ %conv = sext i16 %inc to i64
+ %shl = shl nsw i64 %conv, 4
+ ret i64 %shl
+}
+
+define i64 @extendedRightShiftshortToint64By4(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftshortToint64By4:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfx x0, x[[REG]], #4, #12
+ %inc = add i16 %a, 1
+ %conv = sext i16 %inc to i64
+ %shr = ashr i64 %conv, 4
+ ret i64 %shr
+}
+
+define i64 @extendedLeftShiftshortToint64By16(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftshortToint64By16:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfiz x0, x[[REG]], #16, #16
+ %inc = add i16 %a, 1
+ %conv = sext i16 %inc to i64
+ %shl = shl nsw i64 %conv, 16
+ ret i64 %shl
+}
+
+define i64 @extendedRightShiftshortToint64By16(i16 signext %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftshortToint64By16:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sxth x[[REG]], w[[REG]]
+; CHECK: asr x0, x[[REG]], #16
+ %inc = add i16 %a, 1
+ %conv = sext i16 %inc to i64
+ %shr = ashr i64 %conv, 16
+ ret i64 %shr
+}
+
+define i64 @extendedLeftShiftintToint64By4(i32 %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftintToint64By4:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfiz x0, x[[REG]], #4, #32
+ %inc = add nsw i32 %a, 1
+ %conv = sext i32 %inc to i64
+ %shl = shl nsw i64 %conv, 4
+ ret i64 %shl
+}
+
+define i64 @extendedRightShiftintToint64By4(i32 %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftintToint64By4:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sbfx x0, x[[REG]], #4, #28
+ %inc = add nsw i32 %a, 1
+ %conv = sext i32 %inc to i64
+ %shr = ashr i64 %conv, 4
+ ret i64 %shr
+}
+
+define i64 @extendedLeftShiftintToint64By32(i32 %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedLeftShiftintToint64By32:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: lsl x0, x[[REG]], #32
+ %inc = add nsw i32 %a, 1
+ %conv2 = zext i32 %inc to i64
+ %shl = shl nuw i64 %conv2, 32
+ ret i64 %shl
+}
+
+define i64 @extendedRightShiftintToint64By32(i32 %a) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: extendedRightShiftintToint64By32:
+; CHECK: add w[[REG:[0-9]+]], w0, #1
+; CHECK: sxtw x[[REG]], w[[REG]]
+; CHECK: asr x0, x[[REG]], #32
+ %inc = add nsw i32 %a, 1
+ %conv = sext i32 %inc to i64
+ %shr = ashr i64 %conv, 32
+ ret i64 %shr
+}
diff --git a/test/CodeGen/AArch64/arm64-simd-scalar-to-vector.ll b/test/CodeGen/AArch64/arm64-simd-scalar-to-vector.ll
new file mode 100644
index 0000000..aed39e7
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-simd-scalar-to-vector.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -mcpu=cyclone | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -O0 -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST
+
+define <16 x i8> @foo(<16 x i8> %a) nounwind optsize readnone ssp {
+; CHECK: uaddlv.16b h0, v0
+; CHECK: rshrn.8b v0, v0, #4
+; CHECK: dup.16b v0, v0[0]
+; CHECK: ret
+
+; CHECK-FAST: uaddlv.16b
+; CHECK-FAST: rshrn.8b
+; CHECK-FAST: dup.16b
+ %tmp = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> %a) nounwind
+ %tmp1 = trunc i32 %tmp to i16
+ %tmp2 = insertelement <8 x i16> undef, i16 %tmp1, i32 0
+ %tmp3 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %tmp2, i32 4)
+ %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <16 x i32> zeroinitializer
+ ret <16 x i8> %tmp4
+}
+
+declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8>) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-simplest-elf.ll b/test/CodeGen/AArch64/arm64-simplest-elf.ll
new file mode 100644
index 0000000..1254365
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-simplest-elf.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=arm64-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -filetype=obj < %s | llvm-objdump - -r -d --triple=arm64-linux-gnu | FileCheck --check-prefix=CHECK-ELF %s
+
+define void @foo() nounwind {
+ ret void
+}
+
+ ; Check source looks ELF-like: no leading underscore, comments with //
+; CHECK: foo: // @foo
+; CHECK: ret
+
+ ; Similarly make sure ELF output works and is vaguely sane: aarch64 target
+ ; machine with correct section & symbol names.
+; CHECK-ELF: file format ELF64-aarch64
+
+; CHECK-ELF: Disassembly of section .text
+; CHECK-ELF-LABEL: foo:
+; CHECK-ELF: ret
diff --git a/test/CodeGen/AArch64/arm64-sincos.ll b/test/CodeGen/AArch64/arm64-sincos.ll
new file mode 100644
index 0000000..06157b2
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-sincos.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7 | FileCheck %s --check-prefix CHECK-IOS
+; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefix CHECK-LINUX
+
+; Combine sin / cos into a single call.
+; rdar://12856873
+
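+; On the Darwin target the ___sincosf_stret / ___sincos_stret runtime helpers
+; return both results at once, so the two calls can presumably be merged into
+; one; the generic Linux triple is expected to keep the separate sinf/cosf
+; (sin/cos) libcalls.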
+define float @test1(float %x) nounwind {
+entry:
+; CHECK-IOS-LABEL: test1:
+; CHECK-IOS: bl ___sincosf_stret
+; CHECK-IOS: fadd s0, s0, s1
+
+; CHECK-LINUX-LABEL: test1:
+; CHECK-LINUX: bl sinf
+; CHECK-LINUX: bl cosf
+
+ %call = tail call float @sinf(float %x) nounwind readnone
+ %call1 = tail call float @cosf(float %x) nounwind readnone
+ %add = fadd float %call, %call1
+ ret float %add
+}
+
+define double @test2(double %x) nounwind {
+entry:
+; CHECK-IOS-LABEL: test2:
+; CHECK-IOS: bl ___sincos_stret
+; CHECK-IOS: fadd d0, d0, d1
+
+; CHECK-LINUX-LABEL: test2:
+; CHECK-LINUX: bl sin
+; CHECK-LINUX: bl cos
+
+ %call = tail call double @sin(double %x) nounwind readnone
+ %call1 = tail call double @cos(double %x) nounwind readnone
+ %add = fadd double %call, %call1
+ ret double %add
+}
+
+declare float @sinf(float) readonly
+declare double @sin(double) readonly
+declare float @cosf(float) readonly
+declare double @cos(double) readonly
diff --git a/test/CodeGen/AArch64/arm64-sitofp-combine-chains.ll b/test/CodeGen/AArch64/arm64-sitofp-combine-chains.ll
new file mode 100644
index 0000000..10b433b
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-sitofp-combine-chains.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=arm64 -o - %s | FileCheck %s
+
+; ARM64ISelLowering.cpp was creating a new (floating-point) load for
+; efficiency but not updating the chain successors of the old one. As a
+; result, the two memory operations in this function both ended up as direct
+; successors of the EntryToken and could be reordered.
+
+@var = global i32 0, align 4
+
+define float @foo() {
+; CHECK-LABEL: foo:
+ ; Load must come before we clobber @var
+; CHECK: adrp x[[VARBASE:[0-9]+]], {{_?var}}
+; CHECK: ldr [[SREG:s[0-9]+]], [x[[VARBASE]],
+; CHECK: str wzr, [x[[VARBASE]],
+
+ %val = load i32* @var, align 4
+ store i32 0, i32* @var, align 4
+
+ %fltval = sitofp i32 %val to float
+ ret float %fltval
+}
diff --git a/test/CodeGen/AArch64/arm64-sli-sri-opt.ll b/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
new file mode 100644
index 0000000..7fec539
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
@@ -0,0 +1,41 @@
+; RUN: llc -aarch64-shift-insert-generation=true -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
+
+define void @testLeftGood(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
+; CHECK-LABEL: testLeftGood:
+; CHECK: sli.16b v0, v1, #3
+ %and.i = and <16 x i8> %src1, <i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252>
+ %vshl_n = shl <16 x i8> %src2, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ %result = or <16 x i8> %and.i, %vshl_n
+ store <16 x i8> %result, <16 x i8>* %dest, align 16
+ ret void
+}
+
+define void @testLeftBad(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
+; CHECK-LABEL: testLeftBad:
+; CHECK-NOT: sli
+ %and.i = and <16 x i8> %src1, <i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165>
+ %vshl_n = shl <16 x i8> %src2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %result = or <16 x i8> %and.i, %vshl_n
+ store <16 x i8> %result, <16 x i8>* %dest, align 16
+ ret void
+}
+
+define void @testRightGood(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
+; CHECK-LABEL: testRightGood:
+; CHECK: sri.16b v0, v1, #3
+ %and.i = and <16 x i8> %src1, <i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252, i8 252>
+ %vshl_n = lshr <16 x i8> %src2, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ %result = or <16 x i8> %and.i, %vshl_n
+ store <16 x i8> %result, <16 x i8>* %dest, align 16
+ ret void
+}
+
+define void @testRightBad(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
+; CHECK-LABEL: testRightBad:
+; CHECK-NOT: sri
+ %and.i = and <16 x i8> %src1, <i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165, i8 165>
+ %vshl_n = lshr <16 x i8> %src2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %result = or <16 x i8> %and.i, %vshl_n
+ store <16 x i8> %result, <16 x i8>* %dest, align 16
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-smaxv.ll b/test/CodeGen/AArch64/arm64-smaxv.ll
new file mode 100644
index 0000000..183e667
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-smaxv.ll
@@ -0,0 +1,74 @@
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
+
+define signext i8 @test_vmaxv_s8(<8 x i8> %a1) {
+; CHECK: test_vmaxv_s8
+; CHECK: smaxv.8b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> %a1)
+ %0 = trunc i32 %vmaxv.i to i8
+ ret i8 %0
+}
+
+define signext i16 @test_vmaxv_s16(<4 x i16> %a1) {
+; CHECK: test_vmaxv_s16
+; CHECK: smaxv.4h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> %a1)
+ %0 = trunc i32 %vmaxv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vmaxv_s32(<2 x i32> %a1) {
+; CHECK: test_vmaxv_s32
+; 2 x i32 is not supported by the ISA, so this is a special case
+; CHECK: smaxp.2s v[[REGNUM:[0-9]+]], v0, v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32> %a1)
+ ret i32 %vmaxv.i
+}
+
+define signext i8 @test_vmaxvq_s8(<16 x i8> %a1) {
+; CHECK: test_vmaxvq_s8
+; CHECK: smaxv.16b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> %a1)
+ %0 = trunc i32 %vmaxv.i to i8
+ ret i8 %0
+}
+
+define signext i16 @test_vmaxvq_s16(<8 x i16> %a1) {
+; CHECK: test_vmaxvq_s16
+; CHECK: smaxv.8h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> %a1)
+ %0 = trunc i32 %vmaxv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vmaxvq_s32(<4 x i32> %a1) {
+; CHECK: test_vmaxvq_s32
+; CHECK: smaxv.4s [[REGNUM:s[0-9]+]], v0
+; CHECK-NEXT: fmov w0, [[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> %a1)
+ ret i32 %vmaxv.i
+}
+
+declare i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32>)
+declare i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16>)
+declare i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8>)
+declare i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32>)
+declare i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16>)
+declare i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8>)
+
diff --git a/test/CodeGen/AArch64/arm64-sminv.ll b/test/CodeGen/AArch64/arm64-sminv.ll
new file mode 100644
index 0000000..195c4e5
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-sminv.ll
@@ -0,0 +1,74 @@
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
+
+define signext i8 @test_vminv_s8(<8 x i8> %a1) {
+; CHECK: test_vminv_s8
+; CHECK: sminv.8b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %a1)
+ %0 = trunc i32 %vminv.i to i8
+ ret i8 %0
+}
+
+define signext i16 @test_vminv_s16(<4 x i16> %a1) {
+; CHECK: test_vminv_s16
+; CHECK: sminv.4h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> %a1)
+ %0 = trunc i32 %vminv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vminv_s32(<2 x i32> %a1) {
+; CHECK: test_vminv_s32
+; 2 x i32 is not supported by the ISA, so this is a special case
+; CHECK: sminp.2s v[[REGNUM:[0-9]+]], v0, v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32> %a1)
+ ret i32 %vminv.i
+}
+
+define signext i8 @test_vminvq_s8(<16 x i8> %a1) {
+; CHECK: test_vminvq_s8
+; CHECK: sminv.16b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> %a1)
+ %0 = trunc i32 %vminv.i to i8
+ ret i8 %0
+}
+
+define signext i16 @test_vminvq_s16(<8 x i16> %a1) {
+; CHECK: test_vminvq_s16
+; CHECK: sminv.8h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> %a1)
+ %0 = trunc i32 %vminv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vminvq_s32(<4 x i32> %a1) {
+; CHECK: test_vminvq_s32
+; CHECK: sminv.4s [[REGNUM:s[0-9]+]], v0
+; CHECK-NEXT: fmov w0, [[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> %a1)
+ ret i32 %vminv.i
+}
+
+declare i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32>)
+declare i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16>)
+declare i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8>)
+declare i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32>)
+declare i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16>)
+declare i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8>)
+
diff --git a/test/CodeGen/AArch64/arm64-spill-lr.ll b/test/CodeGen/AArch64/arm64-spill-lr.ll
new file mode 100644
index 0000000..fb6588e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-spill-lr.ll
@@ -0,0 +1,74 @@
+; RUN: llc -mtriple=arm64-apple-ios < %s
+@bar = common global i32 0, align 4
+
+; Leaf function which uses all callee-saved registers and allocates >= 256 bytes on the stack.
+; This will cause processFunctionBeforeCalleeSavedScan() to spill LR as an additional
+; scratch register.
+;
+; This is a crash-only regression test for rdar://15124582.
+define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) nounwind {
+entry:
+ %stack = alloca [128 x i32], align 4
+ %0 = bitcast [128 x i32]* %stack to i8*
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds [128 x i32]* %stack, i64 0, i64 %idxprom
+ store i32 %b, i32* %arrayidx, align 4
+ %1 = load volatile i32* @bar, align 4
+ %2 = load volatile i32* @bar, align 4
+ %3 = load volatile i32* @bar, align 4
+ %4 = load volatile i32* @bar, align 4
+ %5 = load volatile i32* @bar, align 4
+ %6 = load volatile i32* @bar, align 4
+ %7 = load volatile i32* @bar, align 4
+ %8 = load volatile i32* @bar, align 4
+ %9 = load volatile i32* @bar, align 4
+ %10 = load volatile i32* @bar, align 4
+ %11 = load volatile i32* @bar, align 4
+ %12 = load volatile i32* @bar, align 4
+ %13 = load volatile i32* @bar, align 4
+ %14 = load volatile i32* @bar, align 4
+ %15 = load volatile i32* @bar, align 4
+ %16 = load volatile i32* @bar, align 4
+ %17 = load volatile i32* @bar, align 4
+ %18 = load volatile i32* @bar, align 4
+ %19 = load volatile i32* @bar, align 4
+ %20 = load volatile i32* @bar, align 4
+ %idxprom1 = sext i32 %c to i64
+ %arrayidx2 = getelementptr inbounds [128 x i32]* %stack, i64 0, i64 %idxprom1
+ %21 = load i32* %arrayidx2, align 4
+ %factor = mul i32 %h, -2
+ %factor67 = mul i32 %g, -2
+ %factor68 = mul i32 %f, -2
+ %factor69 = mul i32 %e, -2
+ %factor70 = mul i32 %d, -2
+ %factor71 = mul i32 %c, -2
+ %factor72 = mul i32 %b, -2
+ %sum = add i32 %2, %1
+ %sum73 = add i32 %sum, %3
+ %sum74 = add i32 %sum73, %4
+ %sum75 = add i32 %sum74, %5
+ %sum76 = add i32 %sum75, %6
+ %sum77 = add i32 %sum76, %7
+ %sum78 = add i32 %sum77, %8
+ %sum79 = add i32 %sum78, %9
+ %sum80 = add i32 %sum79, %10
+ %sum81 = add i32 %sum80, %11
+ %sum82 = add i32 %sum81, %12
+ %sum83 = add i32 %sum82, %13
+ %sum84 = add i32 %sum83, %14
+ %sum85 = add i32 %sum84, %15
+ %sum86 = add i32 %sum85, %16
+ %sum87 = add i32 %sum86, %17
+ %sum88 = add i32 %sum87, %18
+ %sum89 = add i32 %sum88, %19
+ %sum90 = add i32 %sum89, %20
+ %sub15 = sub i32 %21, %sum90
+ %sub16 = add i32 %sub15, %factor
+ %sub17 = add i32 %sub16, %factor67
+ %sub18 = add i32 %sub17, %factor68
+ %sub19 = add i32 %sub18, %factor69
+ %sub20 = add i32 %sub19, %factor70
+ %sub21 = add i32 %sub20, %factor71
+ %add = add i32 %sub21, %factor72
+ ret i32 %add
+}
diff --git a/test/CodeGen/AArch64/arm64-spill.ll b/test/CodeGen/AArch64/arm64-spill.ll
new file mode 100644
index 0000000..47cdc2b
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-spill.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -aarch64-neon-syntax=apple -verify-machineinstrs
+
+; CHECK: fpr128
+; CHECK: ld1.2d
+; CHECK: str q
+; CHECK: inlineasm
+; CHECK: ldr q
+; CHECK: st1.2d
+define void @fpr128(<4 x float>* %p) nounwind ssp {
+entry:
+ %x = load <4 x float>* %p, align 16
+ call void asm sideeffect "; inlineasm", "~{q0},~{q1},~{q2},~{q3},~{q4},~{q5},~{q6},~{q7},~{q8},~{q9},~{q10},~{q11},~{q12},~{q13},~{q14},~{q15},~{q16},~{q17},~{q18},~{q19},~{q20},~{q21},~{q22},~{q23},~{q24},~{q25},~{q26},~{q27},~{q28},~{q29},~{q30},~{q31},~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{fp},~{lr},~{sp},~{memory}"() nounwind
+ store <4 x float> %x, <4 x float>* %p, align 16
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-st1.ll b/test/CodeGen/AArch64/arm64-st1.ll
new file mode 100644
index 0000000..4370484
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-st1.ll
@@ -0,0 +1,676 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+
+define void @st1lane_16b(<16 x i8> %A, i8* %D) {
+; CHECK-LABEL: st1lane_16b
+; CHECK: st1.b
+ %tmp = extractelement <16 x i8> %A, i32 1
+ store i8 %tmp, i8* %D
+ ret void
+}
+
+define void @st1lane_8h(<8 x i16> %A, i16* %D) {
+; CHECK-LABEL: st1lane_8h
+; CHECK: st1.h
+ %tmp = extractelement <8 x i16> %A, i32 1
+ store i16 %tmp, i16* %D
+ ret void
+}
+
+define void @st1lane_4s(<4 x i32> %A, i32* %D) {
+; CHECK-LABEL: st1lane_4s
+; CHECK: st1.s
+ %tmp = extractelement <4 x i32> %A, i32 1
+ store i32 %tmp, i32* %D
+ ret void
+}
+
+define void @st1lane_4s_float(<4 x float> %A, float* %D) {
+; CHECK-LABEL: st1lane_4s_float
+; CHECK: st1.s
+ %tmp = extractelement <4 x float> %A, i32 1
+ store float %tmp, float* %D
+ ret void
+}
+
+define void @st1lane_2d(<2 x i64> %A, i64* %D) {
+; CHECK-LABEL: st1lane_2d
+; CHECK: st1.d
+ %tmp = extractelement <2 x i64> %A, i32 1
+ store i64 %tmp, i64* %D
+ ret void
+}
+
+define void @st1lane_2d_double(<2 x double> %A, double* %D) {
+; CHECK-LABEL: st1lane_2d_double
+; CHECK: st1.d
+ %tmp = extractelement <2 x double> %A, i32 1
+ store double %tmp, double* %D
+ ret void
+}
+
+define void @st1lane_8b(<8 x i8> %A, i8* %D) {
+; CHECK-LABEL: st1lane_8b
+; CHECK: st1.b
+ %tmp = extractelement <8 x i8> %A, i32 1
+ store i8 %tmp, i8* %D
+ ret void
+}
+
+define void @st1lane_4h(<4 x i16> %A, i16* %D) {
+; CHECK-LABEL: st1lane_4h
+; CHECK: st1.h
+ %tmp = extractelement <4 x i16> %A, i32 1
+ store i16 %tmp, i16* %D
+ ret void
+}
+
+define void @st1lane_2s(<2 x i32> %A, i32* %D) {
+; CHECK-LABEL: st1lane_2s
+; CHECK: st1.s
+ %tmp = extractelement <2 x i32> %A, i32 1
+ store i32 %tmp, i32* %D
+ ret void
+}
+
+define void @st1lane_2s_float(<2 x float> %A, float* %D) {
+; CHECK-LABEL: st1lane_2s_float
+; CHECK: st1.s
+ %tmp = extractelement <2 x float> %A, i32 1
+ store float %tmp, float* %D
+ ret void
+}
+
+define void @st2lane_16b(<16 x i8> %A, <16 x i8> %B, i8* %D) {
+; CHECK-LABEL: st2lane_16b
+; CHECK: st2.b
+ call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i64 1, i8* %D)
+ ret void
+}
+
+define void @st2lane_8h(<8 x i16> %A, <8 x i16> %B, i16* %D) {
+; CHECK-LABEL: st2lane_8h
+; CHECK: st2.h
+ call void @llvm.aarch64.neon.st2lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i64 1, i16* %D)
+ ret void
+}
+
+define void @st2lane_4s(<4 x i32> %A, <4 x i32> %B, i32* %D) {
+; CHECK-LABEL: st2lane_4s
+; CHECK: st2.s
+ call void @llvm.aarch64.neon.st2lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i64 1, i32* %D)
+ ret void
+}
+
+define void @st2lane_2d(<2 x i64> %A, <2 x i64> %B, i64* %D) {
+; CHECK-LABEL: st2lane_2d
+; CHECK: st2.d
+ call void @llvm.aarch64.neon.st2lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64 1, i64* %D)
+ ret void
+}
+
+declare void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readnone
+declare void @llvm.aarch64.neon.st2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readnone
+declare void @llvm.aarch64.neon.st2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readnone
+declare void @llvm.aarch64.neon.st2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
+
+define void @st3lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %D) {
+; CHECK-LABEL: st3lane_16b
+; CHECK: st3.b
+ call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i64 1, i8* %D)
+ ret void
+}
+
+define void @st3lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %D) {
+; CHECK-LABEL: st3lane_8h
+; CHECK: st3.h
+ call void @llvm.aarch64.neon.st3lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i64 1, i16* %D)
+ ret void
+}
+
+define void @st3lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %D) {
+; CHECK-LABEL: st3lane_4s
+; CHECK: st3.s
+ call void @llvm.aarch64.neon.st3lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i64 1, i32* %D)
+ ret void
+}
+
+define void @st3lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %D) {
+; CHECK-LABEL: st3lane_2d
+; CHECK: st3.d
+ call void @llvm.aarch64.neon.st3lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64 1, i64* %D)
+ ret void
+}
+
+declare void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readnone
+declare void @llvm.aarch64.neon.st3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readnone
+declare void @llvm.aarch64.neon.st3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readnone
+declare void @llvm.aarch64.neon.st3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
+
+define void @st4lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %E) {
+; CHECK-LABEL: st4lane_16b
+; CHECK: st4.b
+ call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 1, i8* %E)
+ ret void
+}
+
+define void @st4lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %E) {
+; CHECK-LABEL: st4lane_8h
+; CHECK: st4.h
+ call void @llvm.aarch64.neon.st4lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 1, i16* %E)
+ ret void
+}
+
+define void @st4lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %E) {
+; CHECK-LABEL: st4lane_4s
+; CHECK: st4.s
+ call void @llvm.aarch64.neon.st4lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 1, i32* %E)
+ ret void
+}
+
+define void @st4lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %E) {
+; CHECK-LABEL: st4lane_2d
+; CHECK: st4.d
+ call void @llvm.aarch64.neon.st4lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 1, i64* %E)
+ ret void
+}
+
+declare void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readnone
+declare void @llvm.aarch64.neon.st4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readnone
+declare void @llvm.aarch64.neon.st4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readnone
+declare void @llvm.aarch64.neon.st4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
+
+
+define void @st2_8b(<8 x i8> %A, <8 x i8> %B, i8* %P) nounwind {
+; CHECK-LABEL: st2_8b
+; CHECK: st2.8b
+ call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %P)
+ ret void
+}
+
+define void @st3_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %P) nounwind {
+; CHECK-LABEL: st3_8b
+; CHECK: st3.8b
+ call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %P)
+ ret void
+}
+
+define void @st4_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P) nounwind {
+; CHECK-LABEL: st4_8b
+; CHECK: st4.8b
+ call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P)
+ ret void
+}
+
+declare void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
+
+define void @st2_16b(<16 x i8> %A, <16 x i8> %B, i8* %P) nounwind {
+; CHECK-LABEL: st2_16b
+; CHECK: st2.16b
+ call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %P)
+ ret void
+}
+
+define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %P) nounwind {
+; CHECK-LABEL: st3_16b
+; CHECK: st3.16b
+ call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %P)
+ ret void
+}
+
+define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P) nounwind {
+; CHECK-LABEL: st4_16b
+; CHECK: st4.16b
+ call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P)
+ ret void
+}
+
+declare void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
+
+define void @st2_4h(<4 x i16> %A, <4 x i16> %B, i16* %P) nounwind {
+; CHECK-LABEL: st2_4h
+; CHECK: st2.4h
+ call void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %P)
+ ret void
+}
+
+define void @st3_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %P) nounwind {
+; CHECK-LABEL: st3_4h
+; CHECK: st3.4h
+ call void @llvm.aarch64.neon.st3.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %P)
+ ret void
+}
+
+define void @st4_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P) nounwind {
+; CHECK-LABEL: st4_4h
+; CHECK: st4.4h
+ call void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P)
+ ret void
+}
+
+declare void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
+
+define void @st2_8h(<8 x i16> %A, <8 x i16> %B, i16* %P) nounwind {
+; CHECK-LABEL: st2_8h
+; CHECK: st2.8h
+ call void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %P)
+ ret void
+}
+
+define void @st3_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %P) nounwind {
+; CHECK-LABEL: st3_8h
+; CHECK: st3.8h
+ call void @llvm.aarch64.neon.st3.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %P)
+ ret void
+}
+
+define void @st4_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P) nounwind {
+; CHECK-LABEL: st4_8h
+; CHECK: st4.8h
+ call void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P)
+ ret void
+}
+
+declare void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
+
+define void @st2_2s(<2 x i32> %A, <2 x i32> %B, i32* %P) nounwind {
+; CHECK-LABEL: st2_2s
+; CHECK: st2.2s
+ call void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %P)
+ ret void
+}
+
+define void @st3_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %P) nounwind {
+; CHECK-LABEL: st3_2s
+; CHECK: st3.2s
+ call void @llvm.aarch64.neon.st3.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %P)
+ ret void
+}
+
+define void @st4_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P) nounwind {
+; CHECK-LABEL: st4_2s
+; CHECK: st4.2s
+ call void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P)
+ ret void
+}
+
+declare void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
+
+define void @st2_4s(<4 x i32> %A, <4 x i32> %B, i32* %P) nounwind {
+; CHECK-LABEL: st2_4s
+; CHECK: st2.4s
+ call void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %P)
+ ret void
+}
+
+define void @st3_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %P) nounwind {
+; CHECK-LABEL: st3_4s
+; CHECK: st3.4s
+ call void @llvm.aarch64.neon.st3.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %P)
+ ret void
+}
+
+define void @st4_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P) nounwind {
+; CHECK-LABEL: st4_4s
+; CHECK: st4.4s
+ call void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P)
+ ret void
+}
+
+declare void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
+
+define void @st2_1d(<1 x i64> %A, <1 x i64> %B, i64* %P) nounwind {
+; CHECK-LABEL: st2_1d
+; CHECK: st1.1d
+ call void @llvm.aarch64.neon.st2.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, i64* %P)
+ ret void
+}
+
+define void @st3_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %P) nounwind {
+; CHECK-LABEL: st3_1d
+; CHECK: st1.1d
+ call void @llvm.aarch64.neon.st3.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %P)
+ ret void
+}
+
+define void @st4_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %P) nounwind {
+; CHECK-LABEL: st4_1d
+; CHECK: st1.1d
+ call void @llvm.aarch64.neon.st4.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %P)
+ ret void
+}
+
+declare void @llvm.aarch64.neon.st2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
+
+define void @st2_2d(<2 x i64> %A, <2 x i64> %B, i64* %P) nounwind {
+; CHECK-LABEL: st2_2d
+; CHECK: st2.2d
+ call void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %P)
+ ret void
+}
+
+define void @st3_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %P) nounwind {
+; CHECK-LABEL: st3_2d
+; CHECK: st3.2d
+ call void @llvm.aarch64.neon.st3.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %P)
+ ret void
+}
+
+define void @st4_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P) nounwind {
+; CHECK-LABEL: st4_2d
+; CHECK: st4.2d
+ call void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P)
+ ret void
+}
+
+declare void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
+
+declare void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float>, <2 x float>, float*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double>, <1 x double>, double*) nounwind readonly
+
+define void @st1_x2_v8i8(<8 x i8> %A, <8 x i8> %B, i8* %addr) {
+; CHECK-LABEL: st1_x2_v8i8:
+; CHECK: st1.8b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %addr)
+ ret void
+}
+
+define void @st1_x2_v4i16(<4 x i16> %A, <4 x i16> %B, i16* %addr) {
+; CHECK-LABEL: st1_x2_v4i16:
+; CHECK: st1.4h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %addr)
+ ret void
+}
+
+define void @st1_x2_v2i32(<2 x i32> %A, <2 x i32> %B, i32* %addr) {
+; CHECK-LABEL: st1_x2_v2i32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %addr)
+ ret void
+}
+
+define void @st1_x2_v2f32(<2 x float> %A, <2 x float> %B, float* %addr) {
+; CHECK-LABEL: st1_x2_v2f32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float> %A, <2 x float> %B, float* %addr)
+ ret void
+}
+
+define void @st1_x2_v1i64(<1 x i64> %A, <1 x i64> %B, i64* %addr) {
+; CHECK-LABEL: st1_x2_v1i64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, i64* %addr)
+ ret void
+}
+
+define void @st1_x2_v1f64(<1 x double> %A, <1 x double> %B, double* %addr) {
+; CHECK-LABEL: st1_x2_v1f64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double> %A, <1 x double> %B, double* %addr)
+ ret void
+}
+
+declare void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double>, <2 x double>, double*) nounwind readonly
+
+define void @st1_x2_v16i8(<16 x i8> %A, <16 x i8> %B, i8* %addr) {
+; CHECK-LABEL: st1_x2_v16i8:
+; CHECK: st1.16b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %addr)
+ ret void
+}
+
+define void @st1_x2_v8i16(<8 x i16> %A, <8 x i16> %B, i16* %addr) {
+; CHECK-LABEL: st1_x2_v8i16:
+; CHECK: st1.8h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %addr)
+ ret void
+}
+
+define void @st1_x2_v4i32(<4 x i32> %A, <4 x i32> %B, i32* %addr) {
+; CHECK-LABEL: st1_x2_v4i32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %addr)
+ ret void
+}
+
+define void @st1_x2_v4f32(<4 x float> %A, <4 x float> %B, float* %addr) {
+; CHECK-LABEL: st1_x2_v4f32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> %A, <4 x float> %B, float* %addr)
+ ret void
+}
+
+define void @st1_x2_v2i64(<2 x i64> %A, <2 x i64> %B, i64* %addr) {
+; CHECK-LABEL: st1_x2_v2i64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %addr)
+ ret void
+}
+
+define void @st1_x2_v2f64(<2 x double> %A, <2 x double> %B, double* %addr) {
+; CHECK-LABEL: st1_x2_v2f64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double> %A, <2 x double> %B, double* %addr)
+ ret void
+}
+
+declare void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*) nounwind readonly
+
+define void @st1_x3_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %addr) {
+; CHECK-LABEL: st1_x3_v8i8:
+; CHECK: st1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %addr)
+ ret void
+}
+
+define void @st1_x3_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %addr) {
+; CHECK-LABEL: st1_x3_v4i16:
+; CHECK: st1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %addr)
+ ret void
+}
+
+define void @st1_x3_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %addr) {
+; CHECK-LABEL: st1_x3_v2i32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %addr)
+ ret void
+}
+
+define void @st1_x3_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, float* %addr) {
+; CHECK-LABEL: st1_x3_v2f32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, float* %addr)
+ ret void
+}
+
+define void @st1_x3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %addr) {
+; CHECK-LABEL: st1_x3_v1i64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %addr)
+ ret void
+}
+
+define void @st1_x3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, double* %addr) {
+; CHECK-LABEL: st1_x3_v1f64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, double* %addr)
+ ret void
+}
+
+declare void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*) nounwind readonly
+
+define void @st1_x3_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %addr) {
+; CHECK-LABEL: st1_x3_v16i8:
+; CHECK: st1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %addr)
+ ret void
+}
+
+define void @st1_x3_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %addr) {
+; CHECK-LABEL: st1_x3_v8i16:
+; CHECK: st1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %addr)
+ ret void
+}
+
+define void @st1_x3_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %addr) {
+; CHECK-LABEL: st1_x3_v4i32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %addr)
+ ret void
+}
+
+define void @st1_x3_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, float* %addr) {
+; CHECK-LABEL: st1_x3_v4f32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, float* %addr)
+ ret void
+}
+
+define void @st1_x3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %addr) {
+; CHECK-LABEL: st1_x3_v2i64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %addr)
+ ret void
+}
+
+define void @st1_x3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, double* %addr) {
+; CHECK-LABEL: st1_x3_v2f64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, double* %addr)
+ ret void
+}
+
+
+declare void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*) nounwind readonly
+
+define void @st1_x4_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %addr) {
+; CHECK-LABEL: st1_x4_v8i8:
+; CHECK: st1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %addr)
+ ret void
+}
+
+define void @st1_x4_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %addr) {
+; CHECK-LABEL: st1_x4_v4i16:
+; CHECK: st1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %addr)
+ ret void
+}
+
+define void @st1_x4_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %addr) {
+; CHECK-LABEL: st1_x4_v2i32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %addr)
+ ret void
+}
+
+define void @st1_x4_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, float* %addr) {
+; CHECK-LABEL: st1_x4_v2f32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, float* %addr)
+ ret void
+}
+
+define void @st1_x4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %addr) {
+; CHECK-LABEL: st1_x4_v1i64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %addr)
+ ret void
+}
+
+define void @st1_x4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, double* %addr) {
+; CHECK-LABEL: st1_x4_v1f64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, double* %addr)
+ ret void
+}
+
+declare void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, double*) nounwind readonly
+
+define void @st1_x4_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %addr) {
+; CHECK-LABEL: st1_x4_v16i8:
+; CHECK: st1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %addr)
+ ret void
+}
+
+define void @st1_x4_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %addr) {
+; CHECK-LABEL: st1_x4_v8i16:
+; CHECK: st1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %addr)
+ ret void
+}
+
+define void @st1_x4_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %addr) {
+; CHECK-LABEL: st1_x4_v4i32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %addr)
+ ret void
+}
+
+define void @st1_x4_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, float* %addr) {
+; CHECK-LABEL: st1_x4_v4f32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, float* %addr)
+ ret void
+}
+
+define void @st1_x4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %addr) {
+; CHECK-LABEL: st1_x4_v2i64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %addr)
+ ret void
+}
+
+define void @st1_x4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, double* %addr) {
+; CHECK-LABEL: st1_x4_v2f64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+ call void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, double* %addr)
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-stack-no-frame.ll b/test/CodeGen/AArch64/arm64-stack-no-frame.ll
new file mode 100644
index 0000000..b5970c0
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-stack-no-frame.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s
+
+@global = global [20 x i64] zeroinitializer, align 8
+
+; The following function has enough locals to need some restoring, but not a
+; frame record. In an intermediate frame refactoring, prologue and epilogue were
+; inconsistent about how much to move SP.
+define void @test_stack_no_frame() {
+; CHECK: test_stack_no_frame
+; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]]
+ %local = alloca [20 x i64]
+ %val = load volatile [20 x i64]* @global, align 8
+ store volatile [20 x i64] %val, [20 x i64]* %local, align 8
+
+ %val2 = load volatile [20 x i64]* %local, align 8
+ store volatile [20 x i64] %val2, [20 x i64]* @global, align 8
+
+; CHECK: add sp, sp, #[[STACKSIZE]]
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-stackmap.ll b/test/CodeGen/AArch64/arm64-stackmap.ll
new file mode 100644
index 0000000..2c7c6ae
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-stackmap.ll
@@ -0,0 +1,288 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+;
+; Note: Print verbose stackmaps using -debug-only=stackmaps.
+
+; We do not get the correct stack alignment when cross-compiling for arm64,
+; so specify a datalayout here.
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT: __LLVM_StackMaps:
+; Header
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .short 0
+; Num Functions
+; CHECK-NEXT: .long 11
+; Num LargeConstants
+; CHECK-NEXT: .long 2
+; Num Callsites
+; CHECK-NEXT: .long 11
+
+; Functions and stack size
+; CHECK-NEXT: .quad _constantargs
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _osrinline
+; CHECK-NEXT: .quad 32
+; CHECK-NEXT: .quad _osrcold
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _propertyRead
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _propertyWrite
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _jsVoidCall
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _jsIntCall
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _spilledValue
+; CHECK-NEXT: .quad 160
+; CHECK-NEXT: .quad _spilledStackMapValue
+; CHECK-NEXT: .quad 128
+; CHECK-NEXT: .quad _liveConstant
+; CHECK-NEXT: .quad 16
+; CHECK-NEXT: .quad _clobberLR
+; CHECK-NEXT: .quad 112
+
+; Num LargeConstants
+; CHECK-NEXT: .quad 4294967295
+; CHECK-NEXT: .quad 4294967296
+
+; Constant arguments
+;
+; CHECK-NEXT: .quad 1
+; CHECK-NEXT: .long L{{.*}}-_constantargs
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 4
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 65535
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 65536
+; SmallConstant
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 0
+; LargeConstant at index 0
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 1
+
+define void @constantargs() {
+entry:
+ %0 = inttoptr i64 244837814094590 to i8*
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 1, i32 20, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296)
+ ret void
+}
+
+; Inline OSR Exit
+;
+; CHECK-LABEL: .long L{{.*}}-_osrinline
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define void @osrinline(i64 %a, i64 %b) {
+entry:
+ ; Runtime void->void call.
+ call void inttoptr (i64 244837814094590 to void ()*)()
+ ; Followed by inline OSR patchpoint with 12-byte shadow and 2 live vars.
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 3, i32 12, i64 %a, i64 %b)
+ ret void
+}
+
+; Cold OSR Exit
+;
+; 2 live variables in register.
+;
+; CHECK-LABEL: .long L{{.*}}-_osrcold
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define void @osrcold(i64 %a, i64 %b) {
+entry:
+ %test = icmp slt i64 %a, %b
+ br i1 %test, label %ret, label %cold
+cold:
+ ; OSR patchpoint with 20-byte nop-slide and 2 live vars.
+ %thunk = inttoptr i64 244837814094590 to i8*
+ call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 4, i32 20, i8* %thunk, i32 0, i64 %a, i64 %b)
+ unreachable
+ret:
+ ret void
+}
+
+; Property Read
+; CHECK-LABEL: .long L{{.*}}-_propertyRead
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 0
+;
+; FIXME: There are currently no stackmap entries. After moving to
+; AnyRegCC, we will have entries for the object and return value.
+define i64 @propertyRead(i64* %obj) {
+entry:
+ %resolveRead = inttoptr i64 244837814094590 to i8*
+ %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveRead, i32 1, i64* %obj)
+ %add = add i64 %result, 3
+ ret i64 %add
+}
+
+; Property Write
+; CHECK-LABEL: .long L{{.*}}-_propertyWrite
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define void @propertyWrite(i64 %dummy1, i64* %obj, i64 %dummy2, i64 %a) {
+entry:
+ %resolveWrite = inttoptr i64 244837814094590 to i8*
+ call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
+ ret void
+}
+
+; Void JS Call
+;
+; 2 live variables in registers.
+;
+; CHECK-LABEL: .long L{{.*}}-_jsVoidCall
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define void @jsVoidCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
+entry:
+ %resolveCall = inttoptr i64 244837814094590 to i8*
+ call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 7, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+ ret void
+}
+
+; i64 JS Call
+;
+; 2 live variables in registers.
+;
+; CHECK-LABEL: .long L{{.*}}-_jsIntCall
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define i64 @jsIntCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
+entry:
+ %resolveCall = inttoptr i64 244837814094590 to i8*
+ %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 8, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+ %add = add i64 %result, 3
+ ret i64 %add
+}
+
+; Spilled stack map values.
+;
+; Verify 28 stack map entries.
+;
+; CHECK-LABEL: .long L{{.*}}-_spilledValue
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 28
+;
+; Check that at least one is a spilled entry from RBP.
+; Location: Indirect FP + ...
+; CHECK: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 29
+define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27) {
+entry:
+ call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 11, i32 20, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27)
+ ret void
+}
+
+; Spilled stack map values.
+;
+; Verify 30 stack map entries.
+;
+; CHECK-LABEL: .long L{{.*}}-_spilledStackMapValue
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 30
+;
+; Check that at least one is a spilled entry from RBP.
+; Location: Indirect FP + ...
+; CHECK: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 29
+define webkit_jscc void @spilledStackMapValue(i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29) {
+entry:
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 12, i32 16, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29)
+ ret void
+}
+
+
+; Map a constant value.
+;
+; CHECK-LABEL: .long L{{.*}}-_liveConstant
+; CHECK-NEXT: .short 0
+; 1 location
+; CHECK-NEXT: .short 1
+; Loc 0: SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 33
+
+define void @liveConstant() {
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 15, i32 8, i32 33)
+ ret void
+}
+
+; Map a value when LR is the only free register.
+;
+; CHECK-LABEL: .long L{{.*}}-_clobberLR
+; CHECK-NEXT: .short 0
+; 1 location
+; CHECK-NEXT: .short 1
+; Loc 0: Indirect FP (r29) - offset
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .short 29
+; CHECK-NEXT: .long -{{[0-9]+}}
+define void @clobberLR(i32 %a) {
+ tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x31}"() nounwind
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 8, i32 %a)
+ ret void
+}
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
diff --git a/test/CodeGen/AArch64/arm64-stackpointer.ll b/test/CodeGen/AArch64/arm64-stackpointer.ll
new file mode 100644
index 0000000..581faf1
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-stackpointer.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s
+
+define i64 @get_stack() nounwind {
+entry:
+; CHECK-LABEL: get_stack:
+; CHECK: mov x0, sp
+ %sp = call i64 @llvm.read_register.i64(metadata !0)
+ ret i64 %sp
+}
+
+define void @set_stack(i64 %val) nounwind {
+entry:
+; CHECK-LABEL: set_stack:
+; CHECK: mov sp, x0
+ call void @llvm.write_register.i64(metadata !0, i64 %val)
+ ret void
+}
+
+declare i64 @llvm.read_register.i64(metadata) nounwind
+declare void @llvm.write_register.i64(metadata, i64) nounwind
+
+; register unsigned long current_stack_pointer asm("sp");
+; CHECK-NOT: .asciz "sp"
+!0 = metadata !{metadata !"sp\00"}
diff --git a/test/CodeGen/AArch64/arm64-stacksave.ll b/test/CodeGen/AArch64/arm64-stacksave.ll
new file mode 100644
index 0000000..a79e99b
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-stacksave.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -verify-coalescing
+; <rdar://problem/11522048>
+target triple = "arm64-apple-macosx10.8.0"
+
+; Verify that we can handle spilling the stack pointer without attempting to
+; spill it directly.
+; CHECK: f
+; CHECK: mov [[X0:x[0-9]+]], sp
+; CHECK: str [[X0]]
+; CHECK: inlineasm
+define void @f() nounwind ssp {
+entry:
+ %savedstack = call i8* @llvm.stacksave() nounwind
+ call void asm sideeffect "; inlineasm", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{fp},~{lr},~{sp},~{memory}"() nounwind
+ call void @llvm.stackrestore(i8* %savedstack) nounwind
+ ret void
+}
+
+declare i8* @llvm.stacksave() nounwind
+declare void @llvm.stackrestore(i8*) nounwind
diff --git a/test/CodeGen/AArch64/arm64-stp.ll b/test/CodeGen/AArch64/arm64-stp.ll
new file mode 100644
index 0000000..40bdf22
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-stp.ll
@@ -0,0 +1,101 @@
+; RUN: llc < %s -march=arm64 -aarch64-stp-suppress=false -verify-machineinstrs -mcpu=cyclone | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-unscaled-mem-op=true\
+; RUN: -verify-machineinstrs -mcpu=cyclone | FileCheck -check-prefix=STUR_CHK %s
+
+; CHECK: stp_int
+; CHECK: stp w0, w1, [x2]
+define void @stp_int(i32 %a, i32 %b, i32* nocapture %p) nounwind {
+ store i32 %a, i32* %p, align 4
+ %add.ptr = getelementptr inbounds i32* %p, i64 1
+ store i32 %b, i32* %add.ptr, align 4
+ ret void
+}
+
+; CHECK: stp_long
+; CHECK: stp x0, x1, [x2]
+define void @stp_long(i64 %a, i64 %b, i64* nocapture %p) nounwind {
+ store i64 %a, i64* %p, align 8
+ %add.ptr = getelementptr inbounds i64* %p, i64 1
+ store i64 %b, i64* %add.ptr, align 8
+ ret void
+}
+
+; CHECK: stp_float
+; CHECK: stp s0, s1, [x0]
+define void @stp_float(float %a, float %b, float* nocapture %p) nounwind {
+ store float %a, float* %p, align 4
+ %add.ptr = getelementptr inbounds float* %p, i64 1
+ store float %b, float* %add.ptr, align 4
+ ret void
+}
+
+; CHECK: stp_double
+; CHECK: stp d0, d1, [x0]
+define void @stp_double(double %a, double %b, double* nocapture %p) nounwind {
+ store double %a, double* %p, align 8
+ %add.ptr = getelementptr inbounds double* %p, i64 1
+ store double %b, double* %add.ptr, align 8
+ ret void
+}
+
+; Test the load/store optimizer: combine sturs into an stp, if appropriate
+define void @stur_int(i32 %a, i32 %b, i32* nocapture %p) nounwind {
+; STUR_CHK: stur_int
+; STUR_CHK: stp w{{[0-9]+}}, {{w[0-9]+}}, [x{{[0-9]+}}, #-8]
+; STUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds i32* %p, i32 -1
+ store i32 %a, i32* %p1, align 2
+ %p2 = getelementptr inbounds i32* %p, i32 -2
+ store i32 %b, i32* %p2, align 2
+ ret void
+}
+
+define void @stur_long(i64 %a, i64 %b, i64* nocapture %p) nounwind {
+; STUR_CHK: stur_long
+; STUR_CHK: stp x{{[0-9]+}}, {{x[0-9]+}}, [x{{[0-9]+}}, #-16]
+; STUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds i64* %p, i32 -1
+ store i64 %a, i64* %p1, align 2
+ %p2 = getelementptr inbounds i64* %p, i32 -2
+ store i64 %b, i64* %p2, align 2
+ ret void
+}
+
+define void @stur_float(float %a, float %b, float* nocapture %p) nounwind {
+; STUR_CHK: stur_float
+; STUR_CHK: stp s{{[0-9]+}}, {{s[0-9]+}}, [x{{[0-9]+}}, #-8]
+; STUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds float* %p, i32 -1
+ store float %a, float* %p1, align 2
+ %p2 = getelementptr inbounds float* %p, i32 -2
+ store float %b, float* %p2, align 2
+ ret void
+}
+
+define void @stur_double(double %a, double %b, double* nocapture %p) nounwind {
+; STUR_CHK: stur_double
+; STUR_CHK: stp d{{[0-9]+}}, {{d[0-9]+}}, [x{{[0-9]+}}, #-16]
+; STUR_CHK-NEXT: ret
+ %p1 = getelementptr inbounds double* %p, i32 -1
+ store double %a, double* %p1, align 2
+ %p2 = getelementptr inbounds double* %p, i32 -2
+ store double %b, double* %p2, align 2
+ ret void
+}
+
+define void @splat_v4i32(i32 %v, i32 *%p) {
+entry:
+
+; CHECK-LABEL: splat_v4i32
+; CHECK-DAG: stp w0, w0, [x1]
+; CHECK-DAG: stp w0, w0, [x1, #8]
+; CHECK: ret
+
+ %p17 = insertelement <4 x i32> undef, i32 %v, i32 0
+ %p18 = insertelement <4 x i32> %p17, i32 %v, i32 1
+ %p19 = insertelement <4 x i32> %p18, i32 %v, i32 2
+ %p20 = insertelement <4 x i32> %p19, i32 %v, i32 3
+ %p21 = bitcast i32* %p to <4 x i32>*
+ store <4 x i32> %p20, <4 x i32>* %p21, align 4
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-strict-align.ll b/test/CodeGen/AArch64/arm64-strict-align.ll
new file mode 100644
index 0000000..5d13704
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-strict-align.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-darwin -aarch64-no-strict-align | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-darwin -aarch64-strict-align | FileCheck %s --check-prefix=CHECK-STRICT
+
+define i32 @f0(i32* nocapture %p) nounwind {
+; CHECK-STRICT: ldrh [[HIGH:w[0-9]+]], [x0, #2]
+; CHECK-STRICT: ldrh [[LOW:w[0-9]+]], [x0]
+; CHECK-STRICT: bfi [[LOW]], [[HIGH]], #16, #16
+; CHECK-STRICT: ret
+
+; CHECK: ldr w0, [x0]
+; CHECK: ret
+ %tmp = load i32* %p, align 2
+ ret i32 %tmp
+}
+
+define i64 @f1(i64* nocapture %p) nounwind {
+; CHECK-STRICT: ldp w[[LOW:[0-9]+]], w[[HIGH:[0-9]+]], [x0]
+; CHECK-STRICT: bfi x[[LOW]], x[[HIGH]], #32, #32
+; CHECK-STRICT: ret
+
+; CHECK: ldr x0, [x0]
+; CHECK: ret
+ %tmp = load i64* %p, align 4
+ ret i64 %tmp
+}
diff --git a/test/CodeGen/AArch64/arm64-stur.ll b/test/CodeGen/AArch64/arm64-stur.ll
new file mode 100644
index 0000000..a2e684d
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-stur.ll
@@ -0,0 +1,98 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -mcpu=cyclone | FileCheck %s
+%struct.X = type <{ i32, i64, i64 }>
+
+define void @foo1(i32* %p, i64 %val) nounwind {
+; CHECK-LABEL: foo1:
+; CHECK: stur w1, [x0, #-4]
+; CHECK-NEXT: ret
+ %tmp1 = trunc i64 %val to i32
+ %ptr = getelementptr inbounds i32* %p, i64 -1
+ store i32 %tmp1, i32* %ptr, align 4
+ ret void
+}
+define void @foo2(i16* %p, i64 %val) nounwind {
+; CHECK-LABEL: foo2:
+; CHECK: sturh w1, [x0, #-2]
+; CHECK-NEXT: ret
+ %tmp1 = trunc i64 %val to i16
+ %ptr = getelementptr inbounds i16* %p, i64 -1
+ store i16 %tmp1, i16* %ptr, align 2
+ ret void
+}
+define void @foo3(i8* %p, i64 %val) nounwind {
+; CHECK-LABEL: foo3:
+; CHECK: sturb w1, [x0, #-1]
+; CHECK-NEXT: ret
+ %tmp1 = trunc i64 %val to i8
+ %ptr = getelementptr inbounds i8* %p, i64 -1
+ store i8 %tmp1, i8* %ptr, align 1
+ ret void
+}
+define void @foo4(i16* %p, i32 %val) nounwind {
+; CHECK-LABEL: foo4:
+; CHECK: sturh w1, [x0, #-2]
+; CHECK-NEXT: ret
+ %tmp1 = trunc i32 %val to i16
+ %ptr = getelementptr inbounds i16* %p, i32 -1
+ store i16 %tmp1, i16* %ptr, align 2
+ ret void
+}
+define void @foo5(i8* %p, i32 %val) nounwind {
+; CHECK-LABEL: foo5:
+; CHECK: sturb w1, [x0, #-1]
+; CHECK-NEXT: ret
+ %tmp1 = trunc i32 %val to i8
+ %ptr = getelementptr inbounds i8* %p, i32 -1
+ store i8 %tmp1, i8* %ptr, align 1
+ ret void
+}
+
+define void @foo(%struct.X* nocapture %p) nounwind optsize ssp {
+; CHECK-LABEL: foo:
+; CHECK-NOT: str
+; CHECK: stur xzr, [x0, #12]
+; CHECK-NEXT: stur xzr, [x0, #4]
+; CHECK-NEXT: ret
+ %B = getelementptr inbounds %struct.X* %p, i64 0, i32 1
+ %val = bitcast i64* %B to i8*
+ call void @llvm.memset.p0i8.i64(i8* %val, i8 0, i64 16, i32 1, i1 false)
+ ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
+; Unaligned 16b stores are split into 8b stores for performance.
+; radar://15424193
+
+; CHECK-LABEL: unaligned:
+; CHECK-NOT: str q0
+; CHECK: str d[[REG:[0-9]+]], [x0]
+; CHECK: ext.16b v[[REG2:[0-9]+]], v[[REG]], v[[REG]], #8
+; CHECK: str d[[REG2]], [x0, #8]
+define void @unaligned(<4 x i32>* %p, <4 x i32> %v) nounwind {
+ store <4 x i32> %v, <4 x i32>* %p, align 4
+ ret void
+}
+
+; CHECK-LABEL: aligned:
+; CHECK: str q0
+define void @aligned(<4 x i32>* %p, <4 x i32> %v) nounwind {
+ store <4 x i32> %v, <4 x i32>* %p
+ ret void
+}
+
+; Don't split one and two byte aligned stores.
+; radar://16349308
+
+; CHECK-LABEL: twobytealign:
+; CHECK: str q0
+define void @twobytealign(<4 x i32>* %p, <4 x i32> %v) nounwind {
+ store <4 x i32> %v, <4 x i32>* %p, align 2
+ ret void
+}
+; CHECK-LABEL: onebytealign:
+; CHECK: str q0
+define void @onebytealign(<4 x i32>* %p, <4 x i32> %v) nounwind {
+ store <4 x i32> %v, <4 x i32>* %p, align 1
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-subsections.ll b/test/CodeGen/AArch64/arm64-subsections.ll
new file mode 100644
index 0000000..316e7c3
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-subsections.ll
@@ -0,0 +1,5 @@
+; RUN: llc -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s --check-prefix=CHECK-MACHO
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s --check-prefix=CHECK-ELF
+
+; CHECK-MACHO: .subsections_via_symbols
+; CHECK-ELF-NOT: .subsections_via_symbols
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/arm64-subvector-extend.ll b/test/CodeGen/AArch64/arm64-subvector-extend.ll
new file mode 100644
index 0000000..d5a178a
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-subvector-extend.ll
@@ -0,0 +1,141 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+
+; Test efficient codegen of vector extends up from legal type to 128 bit
+; and 256 bit vector types.
+
+;-----
+; Vectors of i16.
+;-----
+define <8 x i16> @func1(<8 x i8> %v0) nounwind {
+; CHECK-LABEL: func1:
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ret
+ %r = zext <8 x i8> %v0 to <8 x i16>
+ ret <8 x i16> %r
+}
+
+define <8 x i16> @func2(<8 x i8> %v0) nounwind {
+; CHECK-LABEL: func2:
+; CHECK-NEXT: sshll.8h v0, v0, #0
+; CHECK-NEXT: ret
+ %r = sext <8 x i8> %v0 to <8 x i16>
+ ret <8 x i16> %r
+}
+
+define <16 x i16> @func3(<16 x i8> %v0) nounwind {
+; CHECK-LABEL: func3:
+; CHECK-NEXT: ushll2.8h v1, v0, #0
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ret
+ %r = zext <16 x i8> %v0 to <16 x i16>
+ ret <16 x i16> %r
+}
+
+define <16 x i16> @func4(<16 x i8> %v0) nounwind {
+; CHECK-LABEL: func4:
+; CHECK-NEXT: sshll2.8h v1, v0, #0
+; CHECK-NEXT: sshll.8h v0, v0, #0
+; CHECK-NEXT: ret
+ %r = sext <16 x i8> %v0 to <16 x i16>
+ ret <16 x i16> %r
+}
+
+;-----
+; Vectors of i32.
+;-----
+
+define <4 x i32> @afunc1(<4 x i16> %v0) nounwind {
+; CHECK-LABEL: afunc1:
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ret
+ %r = zext <4 x i16> %v0 to <4 x i32>
+ ret <4 x i32> %r
+}
+
+define <4 x i32> @afunc2(<4 x i16> %v0) nounwind {
+; CHECK-LABEL: afunc2:
+; CHECK-NEXT: sshll.4s v0, v0, #0
+; CHECK-NEXT: ret
+ %r = sext <4 x i16> %v0 to <4 x i32>
+ ret <4 x i32> %r
+}
+
+define <8 x i32> @afunc3(<8 x i16> %v0) nounwind {
+; CHECK-LABEL: afunc3:
+; CHECK-NEXT: ushll2.4s v1, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ret
+ %r = zext <8 x i16> %v0 to <8 x i32>
+ ret <8 x i32> %r
+}
+
+define <8 x i32> @afunc4(<8 x i16> %v0) nounwind {
+; CHECK-LABEL: afunc4:
+; CHECK-NEXT: sshll2.4s v1, v0, #0
+; CHECK-NEXT: sshll.4s v0, v0, #0
+; CHECK-NEXT: ret
+ %r = sext <8 x i16> %v0 to <8 x i32>
+ ret <8 x i32> %r
+}
+
+define <8 x i32> @bfunc1(<8 x i8> %v0) nounwind {
+; CHECK-LABEL: bfunc1:
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ushll2.4s v1, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ret
+ %r = zext <8 x i8> %v0 to <8 x i32>
+ ret <8 x i32> %r
+}
+
+define <8 x i32> @bfunc2(<8 x i8> %v0) nounwind {
+; CHECK-LABEL: bfunc2:
+; CHECK-NEXT: sshll.8h v0, v0, #0
+; CHECK-NEXT: sshll2.4s v1, v0, #0
+; CHECK-NEXT: sshll.4s v0, v0, #0
+; CHECK-NEXT: ret
+ %r = sext <8 x i8> %v0 to <8 x i32>
+ ret <8 x i32> %r
+}
+
+;-----
+; Vectors of i64.
+;-----
+
+define <4 x i64> @zfunc1(<4 x i32> %v0) nounwind {
+; CHECK-LABEL: zfunc1:
+; CHECK-NEXT: ushll2.2d v1, v0, #0
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: ret
+ %r = zext <4 x i32> %v0 to <4 x i64>
+ ret <4 x i64> %r
+}
+
+define <4 x i64> @zfunc2(<4 x i32> %v0) nounwind {
+; CHECK-LABEL: zfunc2:
+; CHECK-NEXT: sshll2.2d v1, v0, #0
+; CHECK-NEXT: sshll.2d v0, v0, #0
+; CHECK-NEXT: ret
+ %r = sext <4 x i32> %v0 to <4 x i64>
+ ret <4 x i64> %r
+}
+
+define <4 x i64> @bfunc3(<4 x i16> %v0) nounwind {
+; CHECK-LABEL: bfunc3:
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ushll2.2d v1, v0, #0
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: ret
+ %r = zext <4 x i16> %v0 to <4 x i64>
+ ret <4 x i64> %r
+}
+
+define <4 x i64> @cfunc4(<4 x i16> %v0) nounwind {
+; CHECK-LABEL: cfunc4:
+; CHECK-NEXT: sshll.4s v0, v0, #0
+; CHECK-NEXT: sshll2.2d v1, v0, #0
+; CHECK-NEXT: sshll.2d v0, v0, #0
+; CHECK-NEXT: ret
+ %r = sext <4 x i16> %v0 to <4 x i64>
+ ret <4 x i64> %r
+}
diff --git a/test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll b/test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll
new file mode 100644
index 0000000..4ab2bee
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+; rdar://13214163 - Make sure we generate a correct lookup table for the TBL
+; instruction when the element size of the vector is not 8 bits. We were
+; getting both the endianness wrong and the element indexing wrong.
+define <8 x i16> @foo(<8 x i16> %a) nounwind readnone {
+; CHECK: .section __TEXT,__literal16,16byte_literals
+; CHECK: .align 4
+; CHECK:lCPI0_0:
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 1 ; 0x1
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 1 ; 0x1
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 1 ; 0x1
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 1 ; 0x1
+; CHECK: .byte 8 ; 0x8
+; CHECK: .byte 9 ; 0x9
+; CHECK: .byte 8 ; 0x8
+; CHECK: .byte 9 ; 0x9
+; CHECK: .byte 8 ; 0x8
+; CHECK: .byte 9 ; 0x9
+; CHECK: .byte 8 ; 0x8
+; CHECK: .byte 9 ; 0x9
+; CHECK: .section __TEXT,__text,regular,pure_instructions
+; CHECK: .globl _foo
+; CHECK: .align 2
+; CHECK:_foo: ; @foo
+; CHECK: adrp [[BASE:x[0-9]+]], lCPI0_0@PAGE
+; CHECK: ldr q[[REG:[0-9]+]], {{\[}}[[BASE]], lCPI0_0@PAGEOFF]
+; CHECK: tbl.16b v0, { v0 }, v[[REG]]
+; CHECK: ret
+
+ %val = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x i16> %val
+}
diff --git a/test/CodeGen/AArch64/arm64-tbl.ll b/test/CodeGen/AArch64/arm64-tbl.ll
new file mode 100644
index 0000000..b1ce15a
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-tbl.ll
@@ -0,0 +1,132 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind {
+; CHECK: tbl1_8b
+; CHECK: tbl.8b
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %A, <8 x i8> %B)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind {
+; CHECK: tbl1_16b
+; CHECK: tbl.16b
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %A, <16 x i8> %B)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) {
+; CHECK: tbl2_8b
+; CHECK: tbl.8b
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
+; CHECK: tbl2_16b
+; CHECK: tbl.16b
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
+; CHECK: tbl3_8b
+; CHECK: tbl.8b
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
+; CHECK: tbl3_16b
+; CHECK: tbl.16b
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
+; CHECK: tbl4_8b
+; CHECK: tbl.8b
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
+; CHECK: tbl4_16b
+; CHECK: tbl.16b
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
+ ret <16 x i8> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i8> @tbx1_8b(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) nounwind {
+; CHECK: tbx1_8b
+; CHECK: tbx.8b
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind {
+; CHECK: tbx1_16b
+; CHECK: tbx.16b
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
+; CHECK: tbx2_8b
+; CHECK: tbx.8b
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
+; CHECK: tbx2_16b
+; CHECK: tbx.16b
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
+; CHECK: tbx3_8b
+; CHECK: tbx.8b
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
+; CHECK: tbx3_16b
+; CHECK: tbx.16b
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) {
+; CHECK: tbx4_8b
+; CHECK: tbx.8b
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) {
+; CHECK: tbx4_16b
+; CHECK: tbx.16b
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F)
+ ret <16 x i8> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
diff --git a/test/CodeGen/AArch64/arm64-this-return.ll b/test/CodeGen/AArch64/arm64-this-return.ll
new file mode 100644
index 0000000..30f5b9b
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-this-return.ll
@@ -0,0 +1,83 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+%struct.A = type { i8 }
+%struct.B = type { i32 }
+%struct.C = type { %struct.B }
+%struct.D = type { %struct.B }
+%struct.E = type { %struct.B, %struct.B }
+
+declare %struct.A* @A_ctor_base(%struct.A* returned)
+declare %struct.B* @B_ctor_base(%struct.B* returned, i32)
+declare %struct.B* @B_ctor_complete(%struct.B* returned, i32)
+
+declare %struct.A* @A_ctor_base_nothisret(%struct.A*)
+declare %struct.B* @B_ctor_base_nothisret(%struct.B*, i32)
+declare %struct.B* @B_ctor_complete_nothisret(%struct.B*, i32)
+
+define %struct.C* @C_ctor_base(%struct.C* returned %this, i32 %x) {
+entry:
+; CHECK-LABEL: C_ctor_base:
+; CHECK-NOT: mov {{x[0-9]+}}, x0
+; CHECK: bl {{_?A_ctor_base}}
+; CHECK-NOT: mov x0, {{x[0-9]+}}
+; CHECK: b {{_?B_ctor_base}}
+ %0 = bitcast %struct.C* %this to %struct.A*
+ %call = tail call %struct.A* @A_ctor_base(%struct.A* %0)
+ %1 = getelementptr inbounds %struct.C* %this, i32 0, i32 0
+ %call2 = tail call %struct.B* @B_ctor_base(%struct.B* %1, i32 %x)
+ ret %struct.C* %this
+}
+
+define %struct.C* @C_ctor_base_nothisret(%struct.C* %this, i32 %x) {
+entry:
+; CHECK-LABEL: C_ctor_base_nothisret:
+; CHECK: mov [[SAVETHIS:x[0-9]+]], x0
+; CHECK: bl {{_?A_ctor_base_nothisret}}
+; CHECK: mov x0, [[SAVETHIS]]
+; CHECK-NOT: b {{_?B_ctor_base_nothisret}}
+ %0 = bitcast %struct.C* %this to %struct.A*
+ %call = tail call %struct.A* @A_ctor_base_nothisret(%struct.A* %0)
+ %1 = getelementptr inbounds %struct.C* %this, i32 0, i32 0
+ %call2 = tail call %struct.B* @B_ctor_base_nothisret(%struct.B* %1, i32 %x)
+ ret %struct.C* %this
+}
+
+define %struct.C* @C_ctor_complete(%struct.C* %this, i32 %x) {
+entry:
+; CHECK-LABEL: C_ctor_complete:
+; CHECK: b {{_?C_ctor_base}}
+ %call = tail call %struct.C* @C_ctor_base(%struct.C* %this, i32 %x)
+ ret %struct.C* %this
+}
+
+define %struct.C* @C_ctor_complete_nothisret(%struct.C* %this, i32 %x) {
+entry:
+; CHECK-LABEL: C_ctor_complete_nothisret:
+; CHECK-NOT: b {{_?C_ctor_base_nothisret}}
+ %call = tail call %struct.C* @C_ctor_base_nothisret(%struct.C* %this, i32 %x)
+ ret %struct.C* %this
+}
+
+define %struct.D* @D_ctor_base(%struct.D* %this, i32 %x) {
+entry:
+; CHECK-LABEL: D_ctor_base:
+; CHECK-NOT: mov {{x[0-9]+}}, x0
+; CHECK: bl {{_?B_ctor_complete}}
+; CHECK-NOT: mov x0, {{x[0-9]+}}
+; CHECK: b {{_?B_ctor_complete}}
+ %b = getelementptr inbounds %struct.D* %this, i32 0, i32 0
+ %call = tail call %struct.B* @B_ctor_complete(%struct.B* %b, i32 %x)
+ %call2 = tail call %struct.B* @B_ctor_complete(%struct.B* %b, i32 %x)
+ ret %struct.D* %this
+}
+
+define %struct.E* @E_ctor_base(%struct.E* %this, i32 %x) {
+entry:
+; CHECK-LABEL: E_ctor_base:
+; CHECK-NOT: b {{_?B_ctor_complete}}
+ %b = getelementptr inbounds %struct.E* %this, i32 0, i32 0
+ %call = tail call %struct.B* @B_ctor_complete(%struct.B* %b, i32 %x)
+ %b2 = getelementptr inbounds %struct.E* %this, i32 0, i32 1
+ %call2 = tail call %struct.B* @B_ctor_complete(%struct.B* %b2, i32 %x)
+ ret %struct.E* %this
+}
diff --git a/test/CodeGen/AArch64/arm64-tls-darwin.ll b/test/CodeGen/AArch64/arm64-tls-darwin.ll
new file mode 100644
index 0000000..5e8ec33
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-tls-darwin.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=arm64-apple-ios7.0 %s -o - | FileCheck %s
+
+@var = thread_local global i8 0
+
+; N.b. x0 must be the result of the first load (i.e. the address of the
+; descriptor) when tlv_get_addr is called. Likewise the result is returned in
+; x0.
+define i8 @get_var() {
+; CHECK-LABEL: get_var:
+; CHECK: adrp x[[TLVPDESC_SLOT_HI:[0-9]+]], _var@TLVPPAGE
+; CHECK: ldr x0, [x[[TLVPDESC_SLOT_HI]], _var@TLVPPAGEOFF]
+; CHECK: ldr [[TLV_GET_ADDR:x[0-9]+]], [x0]
+; CHECK: blr [[TLV_GET_ADDR]]
+; CHECK: ldrb w0, [x0]
+
+ %val = load i8* @var, align 1
+ ret i8 %val
+}
diff --git a/test/CodeGen/AArch64/tls-dynamic-together.ll b/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll
index b5d7d89..3daae62 100644
--- a/test/CodeGen/AArch64/tls-dynamic-together.ll
+++ b/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 -mtriple=aarch64-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=arm64-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s
; If the .tlsdesccall and blr parts are emitted completely separately (even with
; glue) then LLVM will separate them quite happily (with a spill at O0, hence
diff --git a/test/CodeGen/AArch64/tls-dynamics.ll b/test/CodeGen/AArch64/arm64-tls-dynamics.ll
index 68c481c..e8a83fd 100644
--- a/test/CodeGen/AArch64/tls-dynamics.ll
+++ b/test/CodeGen/AArch64/arm64-tls-dynamics.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s
@general_dynamic_var = external thread_local global i32
@@ -9,18 +9,20 @@ define i32 @test_generaldynamic() {
%val = load i32* @general_dynamic_var
ret i32 %val
+ ; FIXME: the adrp instructions are redundant (if harmless).
+; CHECK: adrp [[TLSDESC_HI:x[0-9]+]], :tlsdesc:general_dynamic_var
+; CHECK: add x0, [[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var
; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:general_dynamic_var
-; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var
-; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var]
+; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var]
; CHECK: .tlsdesccall general_dynamic_var
; CHECK-NEXT: blr [[CALLEE]]
-; CHECK: mrs x[[TP:[0-9]+]], tpidr_el0
+; CHECK: mrs x[[TP:[0-9]+]], TPIDR_EL0
; CHECK: ldr w0, [x[[TP]], x0]
; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
-; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC
-; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
}
@@ -30,20 +32,21 @@ define i32* @test_generaldynamic_addr() {
ret i32* @general_dynamic_var
+ ; FIXME: the adrp instructions are redundant (if harmless).
+; CHECK: adrp [[TLSDESC_HI:x[0-9]+]], :tlsdesc:general_dynamic_var
+; CHECK: add x0, [[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var
; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:general_dynamic_var
-; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var
-; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var]
+; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:general_dynamic_var]
; CHECK: .tlsdesccall general_dynamic_var
; CHECK-NEXT: blr [[CALLEE]]
-; CHECK: mrs [[TP:x[0-9]+]], tpidr_el0
+; CHECK: mrs [[TP:x[0-9]+]], TPIDR_EL0
; CHECK: add x0, [[TP]], x0
; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
-; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC
-; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
-
}
@local_dynamic_var = external thread_local(localdynamic) global i32
@@ -55,19 +58,24 @@ define i32 @test_localdynamic() {
ret i32 %val
; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
-; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_
-; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_]
+; CHECK: add x0, x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
+; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_]
; CHECK: .tlsdesccall _TLS_MODULE_BASE_
; CHECK-NEXT: blr [[CALLEE]]
; CHECK: movz [[DTP_OFFSET:x[0-9]+]], #:dtprel_g1:local_dynamic_var
; CHECK: movk [[DTP_OFFSET]], #:dtprel_g0_nc:local_dynamic_var
-; CHECK: ldr w0, [x0, [[DTP_OFFSET]]]
+; CHECK: add x[[TPREL:[0-9]+]], x0, [[DTP_OFFSET]]
+
+; CHECK: mrs x[[TPIDR:[0-9]+]], TPIDR_EL0
+
+; CHECK: ldr w0, [x[[TPIDR]], x[[TPREL]]]
; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
-; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC
-; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
}
@@ -78,19 +86,24 @@ define i32* @test_localdynamic_addr() {
ret i32* @local_dynamic_var
; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
-; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_
-; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_]
+; CHECK: add x0, x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
+; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_]
; CHECK: .tlsdesccall _TLS_MODULE_BASE_
; CHECK-NEXT: blr [[CALLEE]]
; CHECK: movz [[DTP_OFFSET:x[0-9]+]], #:dtprel_g1:local_dynamic_var
; CHECK: movk [[DTP_OFFSET]], #:dtprel_g0_nc:local_dynamic_var
-; CHECK: add x0, x0, [[DTP_OFFSET]]
+; CHECK: add [[TPREL:x[0-9]+]], x0, [[DTP_OFFSET]]
+
+; CHECK: mrs [[TPIDR:x[0-9]+]], TPIDR_EL0
+
+; CHECK: add x0, [[TPIDR]], [[TPREL]]
; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
-; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC
-; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
}
@@ -110,8 +123,9 @@ define i32 @test_localdynamic_deduplicate() {
ret i32 %sum
; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
-; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_
-; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_]
+; CHECK: add x0, x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_
+; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
+; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], :tlsdesc_lo12:_TLS_MODULE_BASE_]
; CHECK: .tlsdesccall _TLS_MODULE_BASE_
; CHECK-NEXT: blr [[CALLEE]]
diff --git a/test/CodeGen/AArch64/tls-execs.ll b/test/CodeGen/AArch64/arm64-tls-execs.ll
index 39ceb9a..f0130d8 100644
--- a/test/CodeGen/AArch64/tls-execs.ll
+++ b/test/CodeGen/AArch64/arm64-tls-execs.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -show-mc-encoding < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs -show-mc-encoding < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s
@initial_exec_var = external thread_local(initialexec) global i32
@@ -8,8 +8,8 @@ define i32 @test_initial_exec() {
%val = load i32* @initial_exec_var
; CHECK: adrp x[[GOTADDR:[0-9]+]], :gottprel:initial_exec_var
-; CHECK: ldr x[[TP_OFFSET:[0-9]+]], [x[[GOTADDR]], #:gottprel_lo12:initial_exec_var]
-; CHECK: mrs x[[TP:[0-9]+]], tpidr_el0
+; CHECK: ldr x[[TP_OFFSET:[0-9]+]], [x[[GOTADDR]], :gottprel_lo12:initial_exec_var]
+; CHECK: mrs x[[TP:[0-9]+]], TPIDR_EL0
; CHECK: ldr w0, [x[[TP]], x[[TP_OFFSET]]]
; CHECK-RELOC: R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21
@@ -23,8 +23,8 @@ define i32* @test_initial_exec_addr() {
ret i32* @initial_exec_var
; CHECK: adrp x[[GOTADDR:[0-9]+]], :gottprel:initial_exec_var
-; CHECK: ldr [[TP_OFFSET:x[0-9]+]], [x[[GOTADDR]], #:gottprel_lo12:initial_exec_var]
-; CHECK: mrs [[TP:x[0-9]+]], tpidr_el0
+; CHECK: ldr [[TP_OFFSET:x[0-9]+]], [x[[GOTADDR]], :gottprel_lo12:initial_exec_var]
+; CHECK: mrs [[TP:x[0-9]+]], TPIDR_EL0
; CHECK: add x0, [[TP]], [[TP_OFFSET]]
; CHECK-RELOC: R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21
@@ -32,15 +32,15 @@ define i32* @test_initial_exec_addr() {
}
-@local_exec_var = thread_local(initialexec) global i32 0
+@local_exec_var = thread_local(localexec) global i32 0
define i32 @test_local_exec() {
; CHECK-LABEL: test_local_exec:
%val = load i32* @local_exec_var
-; CHECK: movz [[TP_OFFSET:x[0-9]+]], #:tprel_g1:local_exec_var // encoding: [A,A,0xa0'A',0x92'A']
+; CHECK: movz [[TP_OFFSET:x[0-9]+]], #:tprel_g1:local_exec_var // encoding: [0bAAA{{[01]+}},A,0b101AAAAA,0x92]
; CHECK: movk [[TP_OFFSET]], #:tprel_g0_nc:local_exec_var
-; CHECK: mrs x[[TP:[0-9]+]], tpidr_el0
+; CHECK: mrs x[[TP:[0-9]+]], TPIDR_EL0
; CHECK: ldr w0, [x[[TP]], [[TP_OFFSET]]]
; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G1
@@ -55,7 +55,7 @@ define i32* @test_local_exec_addr() {
; CHECK: movz [[TP_OFFSET:x[0-9]+]], #:tprel_g1:local_exec_var
; CHECK: movk [[TP_OFFSET]], #:tprel_g0_nc:local_exec_var
-; CHECK: mrs [[TP:x[0-9]+]], tpidr_el0
+; CHECK: mrs [[TP:x[0-9]+]], TPIDR_EL0
; CHECK: add x0, [[TP]], [[TP_OFFSET]]
; CHECK-RELOC: R_AARCH64_TLSLE_MOVW_TPREL_G1
diff --git a/test/CodeGen/AArch64/arm64-trap.ll b/test/CodeGen/AArch64/arm64-trap.ll
new file mode 100644
index 0000000..5e99c32
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-trap.ll
@@ -0,0 +1,8 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+define void @foo() nounwind {
+; CHECK: foo
+; CHECK: brk #0x1
+ tail call void @llvm.trap()
+ ret void
+}
+declare void @llvm.trap() nounwind
diff --git a/test/CodeGen/AArch64/arm64-trn.ll b/test/CodeGen/AArch64/arm64-trn.ll
new file mode 100644
index 0000000..2db7a14
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-trn.ll
@@ -0,0 +1,134 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @vtrni8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vtrni8:
+;CHECK: trn1.8b
+;CHECK: trn2.8b
+;CHECK-NEXT: add.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @vtrni16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: vtrni16:
+;CHECK: trn1.4h
+;CHECK: trn2.4h
+;CHECK-NEXT: add.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+; 2xi32 TRN is redundant with ZIP
+define <2 x i32> @vtrni32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: vtrni32:
+;CHECK: zip1.2s
+;CHECK: zip2.2s
+;CHECK-NEXT: add.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 0, i32 2>
+ %tmp4 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 3>
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <2 x float> @vtrnf(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: vtrnf:
+;CHECK: zip1.2s
+;CHECK: zip2.2s
+;CHECK-NEXT: fadd.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <2 x i32> <i32 0, i32 2>
+ %tmp4 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <2 x i32> <i32 1, i32 3>
+ %tmp5 = fadd <2 x float> %tmp3, %tmp4
+ ret <2 x float> %tmp5
+}
+
+define <16 x i8> @vtrnQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vtrnQi8:
+;CHECK: trn1.16b
+;CHECK: trn2.16b
+;CHECK-NEXT: add.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+ %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @vtrnQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vtrnQi16:
+;CHECK: trn1.8h
+;CHECK: trn2.8h
+;CHECK-NEXT: add.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @vtrnQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: vtrnQi32:
+;CHECK: trn1.4s
+;CHECK: trn2.4s
+;CHECK-NEXT: add.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <4 x float> @vtrnQf(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: vtrnQf:
+;CHECK: trn1.4s
+;CHECK: trn2.4s
+;CHECK-NEXT: fadd.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %tmp5 = fadd <4 x float> %tmp3, %tmp4
+ ret <4 x float> %tmp5
+}
+
+; Undef shuffle indices should not prevent matching to VTRN:
+
+define <8 x i8> @vtrni8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vtrni8_undef:
+;CHECK: trn1.8b
+;CHECK: trn2.8b
+;CHECK-NEXT: add.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 undef, i32 2, i32 10, i32 undef, i32 12, i32 6, i32 14>
+ %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 undef, i32 undef, i32 15>
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <8 x i16> @vtrnQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vtrnQi16_undef:
+;CHECK: trn1.8h
+;CHECK: trn2.8h
+;CHECK-NEXT: add.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 6, i32 14>
+ %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 undef, i32 3, i32 11, i32 5, i32 13, i32 undef, i32 undef>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
diff --git a/test/CodeGen/AArch64/arm64-trunc-store.ll b/test/CodeGen/AArch64/arm64-trunc-store.ll
new file mode 100644
index 0000000..cf15247
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-trunc-store.ll
@@ -0,0 +1,75 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+
+define void @bar(<8 x i16> %arg, <8 x i8>* %p) nounwind {
+; CHECK-LABEL: bar:
+; CHECK: xtn.8b v[[REG:[0-9]+]], v0
+; CHECK-NEXT: str d[[REG]], [x0]
+; CHECK-NEXT: ret
+ %tmp = trunc <8 x i16> %arg to <8 x i8>
+ store <8 x i8> %tmp, <8 x i8>* %p, align 8
+ ret void
+}
+
+@zptr8 = common global i8* null, align 8
+@zptr16 = common global i16* null, align 8
+@zptr32 = common global i32* null, align 8
+
+define void @fct32(i32 %arg, i64 %var) {
+; CHECK: fct32
+; CHECK: adrp [[GLOBALPAGE:x[0-9]+]], _zptr32@GOTPAGE
+; CHECK: ldr [[GLOBALOFF:x[0-9]+]], {{\[}}[[GLOBALPAGE]], _zptr32@GOTPAGEOFF]
+; CHECK: ldr [[GLOBALADDR:x[0-9]+]], {{\[}}[[GLOBALOFF]]]
+; w0 is %arg
+; CHECK-NEXT: sub w[[OFFSETREGNUM:[0-9]+]], w0, #1
+; w1 is %var truncated
+; CHECK-NEXT: str w1, {{\[}}[[GLOBALADDR]], w[[OFFSETREGNUM]], sxtw #2]
+; CHECK-NEXT: ret
+bb:
+ %.pre37 = load i32** @zptr32, align 8
+ %dec = add nsw i32 %arg, -1
+ %idxprom8 = sext i32 %dec to i64
+ %arrayidx9 = getelementptr inbounds i32* %.pre37, i64 %idxprom8
+ %tmp = trunc i64 %var to i32
+ store i32 %tmp, i32* %arrayidx9, align 4
+ ret void
+}
+
+define void @fct16(i32 %arg, i64 %var) {
+; CHECK: fct16
+; CHECK: adrp [[GLOBALPAGE:x[0-9]+]], _zptr16@GOTPAGE
+; CHECK: ldr [[GLOBALOFF:x[0-9]+]], {{\[}}[[GLOBALPAGE]], _zptr16@GOTPAGEOFF]
+; CHECK: ldr [[GLOBALADDR:x[0-9]+]], {{\[}}[[GLOBALOFF]]]
+; w0 is %arg
+; CHECK-NEXT: sub w[[OFFSETREGNUM:[0-9]+]], w0, #1
+; w1 is %var truncated
+; CHECK-NEXT: strh w1, {{\[}}[[GLOBALADDR]], w[[OFFSETREGNUM]], sxtw #1]
+; CHECK-NEXT: ret
+bb:
+ %.pre37 = load i16** @zptr16, align 8
+ %dec = add nsw i32 %arg, -1
+ %idxprom8 = sext i32 %dec to i64
+ %arrayidx9 = getelementptr inbounds i16* %.pre37, i64 %idxprom8
+ %tmp = trunc i64 %var to i16
+ store i16 %tmp, i16* %arrayidx9, align 4
+ ret void
+}
+
+define void @fct8(i32 %arg, i64 %var) {
+; CHECK: fct8
+; CHECK: adrp [[GLOBALPAGE:x[0-9]+]], _zptr8@GOTPAGE
+; CHECK: ldr [[GLOBALOFF:x[0-9]+]], {{\[}}[[GLOBALPAGE]], _zptr8@GOTPAGEOFF]
+; CHECK: ldr [[BASEADDR:x[0-9]+]], {{\[}}[[GLOBALOFF]]]
+; w0 is %arg
+; CHECK-NEXT: add [[ADDR:x[0-9]+]], [[BASEADDR]], w0, sxtw
+; w1 is %var truncated
+; CHECK-NEXT: sturb w1, {{\[}}[[ADDR]], #-1]
+; CHECK-NEXT: ret
+bb:
+ %.pre37 = load i8** @zptr8, align 8
+ %dec = add nsw i32 %arg, -1
+ %idxprom8 = sext i32 %dec to i64
+ %arrayidx9 = getelementptr inbounds i8* %.pre37, i64 %idxprom8
+ %tmp = trunc i64 %var to i8
+ store i8 %tmp, i8* %arrayidx9, align 4
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-umaxv.ll b/test/CodeGen/AArch64/arm64-umaxv.ll
new file mode 100644
index 0000000..d523f31
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-umaxv.ll
@@ -0,0 +1,92 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define i32 @vmax_u8x8(<8 x i8> %a) nounwind ssp {
+; CHECK-LABEL: vmax_u8x8:
+; CHECK: umaxv.8b b[[REG:[0-9]+]], v0
+; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz [[REG2]],
+entry:
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %a) nounwind
+ %tmp = trunc i32 %vmaxv.i to i8
+ %tobool = icmp eq i8 %tmp, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then:
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+declare i32 @bar(...)
+
+define i32 @vmax_u4x16(<4 x i16> %a) nounwind ssp {
+; CHECK-LABEL: vmax_u4x16:
+; CHECK: umaxv.4h h[[REG:[0-9]+]], v0
+; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz [[REG2]],
+entry:
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> %a) nounwind
+ %tmp = trunc i32 %vmaxv.i to i16
+ %tobool = icmp eq i16 %tmp, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then:
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define i32 @vmax_u8x16(<8 x i16> %a) nounwind ssp {
+; CHECK-LABEL: vmax_u8x16:
+; CHECK: umaxv.8h h[[REG:[0-9]+]], v0
+; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz [[REG2]],
+entry:
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> %a) nounwind
+ %tmp = trunc i32 %vmaxv.i to i16
+ %tobool = icmp eq i16 %tmp, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then:
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define i32 @vmax_u16x8(<16 x i8> %a) nounwind ssp {
+; CHECK-LABEL: vmax_u16x8:
+; CHECK: umaxv.16b b[[REG:[0-9]+]], v0
+; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz [[REG2]],
+entry:
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %a) nounwind
+ %tmp = trunc i32 %vmaxv.i to i8
+ %tobool = icmp eq i8 %tmp, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then:
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+declare i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8>) nounwind readnone
+declare i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16>) nounwind readnone
+declare i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16>) nounwind readnone
+declare i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8>) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-uminv.ll b/test/CodeGen/AArch64/arm64-uminv.ll
new file mode 100644
index 0000000..3bade4b
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-uminv.ll
@@ -0,0 +1,92 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define i32 @vmin_u8x8(<8 x i8> %a) nounwind ssp {
+; CHECK-LABEL: vmin_u8x8:
+; CHECK: uminv.8b b[[REG:[0-9]+]], v0
+; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz [[REG2]],
+entry:
+ %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %a) nounwind
+ %tmp = trunc i32 %vminv.i to i8
+ %tobool = icmp eq i8 %tmp, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then:
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+declare i32 @bar(...)
+
+define i32 @vmin_u4x16(<4 x i16> %a) nounwind ssp {
+; CHECK-LABEL: vmin_u4x16:
+; CHECK: uminv.4h h[[REG:[0-9]+]], v0
+; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz [[REG2]],
+entry:
+ %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> %a) nounwind
+ %tmp = trunc i32 %vminv.i to i16
+ %tobool = icmp eq i16 %tmp, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then:
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define i32 @vmin_u8x16(<8 x i16> %a) nounwind ssp {
+; CHECK-LABEL: vmin_u8x16:
+; CHECK: uminv.8h h[[REG:[0-9]+]], v0
+; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz [[REG2]],
+entry:
+ %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> %a) nounwind
+ %tmp = trunc i32 %vminv.i to i16
+ %tobool = icmp eq i16 %tmp, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then:
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define i32 @vmin_u16x8(<16 x i8> %a) nounwind ssp {
+; CHECK-LABEL: vmin_u16x8:
+; CHECK: uminv.16b b[[REG:[0-9]+]], v0
+; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
+; CHECK-NOT: and
+; CHECK: cbz [[REG2]],
+entry:
+ %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %a) nounwind
+ %tmp = trunc i32 %vminv.i to i8
+ %tobool = icmp eq i8 %tmp, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then:
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() nounwind
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+declare i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8>) nounwind readnone
+declare i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16>) nounwind readnone
+declare i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16>) nounwind readnone
+declare i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8>) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-umov.ll b/test/CodeGen/AArch64/arm64-umov.ll
new file mode 100644
index 0000000..a1ef990
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-umov.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define zeroext i8 @f1(<16 x i8> %a) {
+; CHECK-LABEL: f1:
+; CHECK: mov.b w0, v0[3]
+; CHECK-NEXT: ret
+ %vecext = extractelement <16 x i8> %a, i32 3
+ ret i8 %vecext
+}
+
+define zeroext i16 @f2(<4 x i16> %a) {
+; CHECK-LABEL: f2:
+; CHECK: mov.h w0, v0[2]
+; CHECK-NEXT: ret
+ %vecext = extractelement <4 x i16> %a, i32 2
+ ret i16 %vecext
+}
+
+define i32 @f3(<2 x i32> %a) {
+; CHECK-LABEL: f3:
+; CHECK: mov.s w0, v0[1]
+; CHECK-NEXT: ret
+ %vecext = extractelement <2 x i32> %a, i32 1
+ ret i32 %vecext
+}
+
+define i64 @f4(<2 x i64> %a) {
+; CHECK-LABEL: f4:
+; CHECK: mov.d x0, v0[1]
+; CHECK-NEXT: ret
+ %vecext = extractelement <2 x i64> %a, i32 1
+ ret i64 %vecext
+}
diff --git a/test/CodeGen/AArch64/arm64-unaligned_ldst.ll b/test/CodeGen/AArch64/arm64-unaligned_ldst.ll
new file mode 100644
index 0000000..20b80c0
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-unaligned_ldst.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+; rdar://11231896
+
+define void @t1(i8* nocapture %a, i8* nocapture %b) nounwind {
+entry:
+; CHECK-LABEL: t1:
+; CHECK-NOT: orr
+; CHECK: ldr [[X0:x[0-9]+]], [x1]
+; CHECK: str [[X0]], [x0]
+ %tmp1 = bitcast i8* %b to i64*
+ %tmp2 = bitcast i8* %a to i64*
+ %tmp3 = load i64* %tmp1, align 1
+ store i64 %tmp3, i64* %tmp2, align 1
+ ret void
+}
+
+define void @t2(i8* nocapture %a, i8* nocapture %b) nounwind {
+entry:
+; CHECK-LABEL: t2:
+; CHECK-NOT: orr
+; CHECK: ldr [[W0:w[0-9]+]], [x1]
+; CHECK: str [[W0]], [x0]
+ %tmp1 = bitcast i8* %b to i32*
+ %tmp2 = bitcast i8* %a to i32*
+ %tmp3 = load i32* %tmp1, align 1
+ store i32 %tmp3, i32* %tmp2, align 1
+ ret void
+}
+
+define void @t3(i8* nocapture %a, i8* nocapture %b) nounwind {
+entry:
+; CHECK-LABEL: t3:
+; CHECK-NOT: orr
+; CHECK: ldrh [[W0:w[0-9]+]], [x1]
+; CHECK: strh [[W0]], [x0]
+ %tmp1 = bitcast i8* %b to i16*
+ %tmp2 = bitcast i8* %a to i16*
+ %tmp3 = load i16* %tmp1, align 1
+ store i16 %tmp3, i16* %tmp2, align 1
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-uzp.ll b/test/CodeGen/AArch64/arm64-uzp.ll
new file mode 100644
index 0000000..cdd8d31
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-uzp.ll
@@ -0,0 +1,107 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vuzpi8:
+;CHECK: uzp1.8b
+;CHECK: uzp2.8b
+;CHECK-NEXT: add.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: vuzpi16:
+;CHECK: uzp1.4h
+;CHECK: uzp2.4h
+;CHECK-NEXT: add.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <16 x i8> @vuzpQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vuzpQi8:
+;CHECK: uzp1.16b
+;CHECK: uzp2.16b
+;CHECK-NEXT: add.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @vuzpQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vuzpQi16:
+;CHECK: uzp1.8h
+;CHECK: uzp2.8h
+;CHECK-NEXT: add.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @vuzpQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: vuzpQi32:
+;CHECK: uzp1.4s
+;CHECK: uzp2.4s
+;CHECK-NEXT: add.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <4 x float> @vuzpQf(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: vuzpQf:
+;CHECK: uzp1.4s
+;CHECK: uzp2.4s
+;CHECK-NEXT: fadd.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %tmp5 = fadd <4 x float> %tmp3, %tmp4
+ ret <4 x float> %tmp5
+}
+
+; Undef shuffle indices should not prevent matching to VUZP:
+
+define <8 x i8> @vuzpi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vuzpi8_undef:
+;CHECK: uzp1.8b
+;CHECK: uzp2.8b
+;CHECK-NEXT: add.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14>
+ %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15>
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <8 x i16> @vuzpQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vuzpQi16_undef:
+;CHECK: uzp1.8h
+;CHECK: uzp2.8h
+;CHECK-NEXT: add.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 14>
+ %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 3, i32 5, i32 undef, i32 undef, i32 11, i32 13, i32 15>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
diff --git a/test/CodeGen/AArch64/arm64-vaargs.ll b/test/CodeGen/AArch64/arm64-vaargs.ll
new file mode 100644
index 0000000..ce07635
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vaargs.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64"
+target triple = "arm64-apple-darwin11.0.0"
+
+define float @t1(i8* nocapture %fmt, ...) nounwind ssp {
+entry:
+; CHECK: t1
+; CHECK: fcvt
+ %argp = alloca i8*, align 8
+ %argp1 = bitcast i8** %argp to i8*
+ call void @llvm.va_start(i8* %argp1)
+ %0 = va_arg i8** %argp, i32
+ %1 = va_arg i8** %argp, float
+ call void @llvm.va_end(i8* %argp1)
+ ret float %1
+}
+
+declare void @llvm.va_start(i8*) nounwind
+
+declare void @llvm.va_end(i8*) nounwind
diff --git a/test/CodeGen/AArch64/arm64-vabs.ll b/test/CodeGen/AArch64/arm64-vabs.ll
new file mode 100644
index 0000000..5afc8d9
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vabs.ll
@@ -0,0 +1,804 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+
+define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sabdl8h:
+;CHECK: sabdl.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @sabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sabdl4s:
+;CHECK: sabdl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sabdl2d:
+;CHECK: sabdl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sabdl2_8h:
+;CHECK: sabdl2.8h
+ %load1 = load <16 x i8>* %A
+ %load2 = load <16 x i8>* %B
+ %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sabdl2_4s:
+;CHECK: sabdl2.4s
+ %load1 = load <8 x i16>* %A
+ %load2 = load <8 x i16>* %B
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sabdl2_2d:
+;CHECK: sabdl2.2d
+ %load1 = load <4 x i32>* %A
+ %load2 = load <4 x i32>* %B
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @uabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uabdl8h:
+;CHECK: uabdl.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @uabdl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uabdl4s:
+;CHECK: uabdl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uabdl2d:
+;CHECK: uabdl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uabdl2_8h:
+;CHECK: uabdl2.8h
+ %load1 = load <16 x i8>* %A
+ %load2 = load <16 x i8>* %B
+ %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uabdl2_4s:
+;CHECK: uabdl2.4s
+ %load1 = load <8 x i16>* %A
+ %load2 = load <8 x i16>* %B
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uabdl2_2d:
+;CHECK: uabdl2.2d
+ %load1 = load <4 x i32>* %A
+ %load2 = load <4 x i32>* %B
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fabd_2s:
+;CHECK: fabd.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @fabd_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fabd_4s:
+;CHECK: fabd.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @fabd_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fabd_2d:
+;CHECK: fabd.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sabd_8b:
+;CHECK: sabd.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @sabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sabd_16b:
+;CHECK: sabd.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @sabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sabd_4h:
+;CHECK: sabd.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sabd_8h:
+;CHECK: sabd.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @sabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sabd_2s:
+;CHECK: sabd.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sabd_4s:
+;CHECK: sabd.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uabd_8b:
+;CHECK: uabd.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @uabd_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uabd_16b:
+;CHECK: uabd.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @uabd_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uabd_4h:
+;CHECK: uabd.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @uabd_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uabd_8h:
+;CHECK: uabd.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @uabd_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uabd_2s:
+;CHECK: uabd.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @uabd_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uabd_4s:
+;CHECK: uabd.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: sqabs_8b:
+;CHECK: sqabs.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @sqabs_16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: sqabs_16b:
+;CHECK: sqabs.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @sqabs_4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: sqabs_4h:
+;CHECK: sqabs.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sqabs_8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqabs_8h:
+;CHECK: sqabs.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @sqabs_2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: sqabs_2s:
+;CHECK: sqabs.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sqabs_4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqabs_4s:
+;CHECK: sqabs.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
+
+define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: sqneg_8b:
+;CHECK: sqneg.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @sqneg_16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: sqneg_16b:
+;CHECK: sqneg.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @sqneg_4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: sqneg_4h:
+;CHECK: sqneg.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sqneg_8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqneg_8h:
+;CHECK: sqneg.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @sqneg_2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: sqneg_2s:
+;CHECK: sqneg.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sqneg_4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqneg_4s:
+;CHECK: sqneg.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
+
+define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: abs_8b:
+;CHECK: abs.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @abs_16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: abs_16b:
+;CHECK: abs.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @abs_4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: abs_4h:
+;CHECK: abs.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @abs_8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: abs_8h:
+;CHECK: abs.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @abs_2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: abs_2s:
+;CHECK: abs.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @abs_4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: abs_4s:
+;CHECK: abs.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp3
+}
+
+define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
+; CHECK-LABEL: abs_1d:
+; CHECK: abs d0, d0
+ %abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A)
+ ret <1 x i64> %abs
+}
+
+define i64 @abs_1d_honestly(i64 %A) nounwind {
+; CHECK-LABEL: abs_1d_honestly:
+; CHECK: abs d0, d0
+ %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
+ ret i64 %abs
+}
+
+declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
+declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone
+
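+; The [su]abal tests take an absolute difference, widen the (always
+; non-negative) result with zext, and add it to the accumulator; the CHECK
+; lines expect this pattern to be selected as a single sabal/uabal, with the
+; "2" variants also folding the high-half shufflevector into the 2 form.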
+define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
+;CHECK-LABEL: sabal8h:
+;CHECK: sabal.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i16>* %C
+ %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4.1
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @sabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: sabal4s:
+;CHECK: sabal.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
+ %tmp5 = add <4 x i32> %tmp3, %tmp4.1
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: sabal2d:
+;CHECK: sabal.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
+ %tmp4.1.1 = zext <2 x i32> %tmp4 to <2 x i64>
+ %tmp5 = add <2 x i64> %tmp3, %tmp4.1
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
+;CHECK-LABEL: sabal2_8h:
+;CHECK: sabal2.8h
+ %load1 = load <16 x i8>* %A
+ %load2 = load <16 x i8>* %B
+ %tmp3 = load <8 x i16>* %C
+ %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4.1
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: sabal2_4s:
+;CHECK: sabal2.4s
+ %load1 = load <8 x i16>* %A
+ %load2 = load <8 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
+ %tmp5 = add <4 x i32> %tmp3, %tmp4.1
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: sabal2_2d:
+;CHECK: sabal2.2d
+ %load1 = load <4 x i32>* %A
+ %load2 = load <4 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
+ %tmp5 = add <2 x i64> %tmp3, %tmp4.1
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @uabal8h(<8 x i8>* %A, <8 x i8>* %B, <8 x i16>* %C) nounwind {
+;CHECK-LABEL: uabal8h:
+;CHECK: uabal.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = load <8 x i16>* %C
+ %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4.1
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @uabal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: uabal4s:
+;CHECK: uabal.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
+ %tmp5 = add <4 x i32> %tmp3, %tmp4.1
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: uabal2d:
+;CHECK: uabal.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
+ %tmp5 = add <2 x i64> %tmp3, %tmp4.1
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
+;CHECK-LABEL: uabal2_8h:
+;CHECK: uabal2.8h
+ %load1 = load <16 x i8>* %A
+ %load2 = load <16 x i8>* %B
+ %tmp3 = load <8 x i16>* %C
+ %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4.1
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: uabal2_4s:
+;CHECK: uabal2.4s
+ %load1 = load <8 x i16>* %A
+ %load2 = load <8 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
+ %tmp5 = add <4 x i32> %tmp3, %tmp4.1
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: uabal2_2d:
+;CHECK: uabal2.2d
+ %load1 = load <4 x i32>* %A
+ %load2 = load <4 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
+ %tmp5 = add <2 x i64> %tmp3, %tmp4.1
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i8> @saba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+;CHECK-LABEL: saba_8b:
+;CHECK: saba.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4 = load <8 x i8>* %C
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <16 x i8> @saba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
+;CHECK-LABEL: saba_16b:
+;CHECK: saba.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp4 = load <16 x i8>* %C
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <4 x i16> @saba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+;CHECK-LABEL: saba_4h:
+;CHECK: saba.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = load <4 x i16>* %C
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <8 x i16> @saba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
+;CHECK-LABEL: saba_8h:
+;CHECK: saba.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp4 = load <8 x i16>* %C
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <2 x i32> @saba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+;CHECK-LABEL: saba_2s:
+;CHECK: saba.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = load <2 x i32>* %C
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <4 x i32> @saba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: saba_4s:
+;CHECK: saba.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp4 = load <4 x i32>* %C
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <8 x i8> @uaba_8b(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
+;CHECK-LABEL: uaba_8b:
+;CHECK: uaba.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ %tmp4 = load <8 x i8>* %C
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <16 x i8> @uaba_16b(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
+;CHECK-LABEL: uaba_16b:
+;CHECK: uaba.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ %tmp4 = load <16 x i8>* %C
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <4 x i16> @uaba_4h(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+;CHECK-LABEL: uaba_4h:
+;CHECK: uaba.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp4 = load <4 x i16>* %C
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <8 x i16> @uaba_8h(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
+;CHECK-LABEL: uaba_8h:
+;CHECK: uaba.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ %tmp4 = load <8 x i16>* %C
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <2 x i32> @uaba_2s(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+;CHECK-LABEL: uaba_2s:
+;CHECK: uaba.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp4 = load <2 x i32>* %C
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <4 x i32> @uaba_4s(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: uaba_4s:
+;CHECK: uaba.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ %tmp4 = load <4 x i32>* %C
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+; Scalar FABD
+define float @fabds(float %a, float %b) nounwind {
+; CHECK-LABEL: fabds:
+; CHECK: fabd s0, s0, s1
+ %vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind
+ ret float %vabd.i
+}
+
+define double @fabdd(double %a, double %b) nounwind {
+; CHECK-LABEL: fabdd:
+; CHECK: fabd d0, d0, d1
+ %vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind
+ ret double %vabd.i
+}
+
+declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone
+declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone
+
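+; Using the extracted high half of one operand and a dup'ed scalar for the
+; other should select [su]abdl2 directly; CHECK-NOT verifies that no
+; separate ext.16b is emitted to form the high half.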
+define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: uabdl_from_extract_dup:
+; CHECK-NOT: ext.16b
+; CHECK: uabdl2.2d
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+ %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+ %res1 = zext <2 x i32> %res to <2 x i64>
+ ret <2 x i64> %res1
+}
+
+define <2 x i64> @sabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: sabdl_from_extract_dup:
+; CHECK-NOT: ext.16b
+; CHECK: sabdl2.2d
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+ %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+ %res1 = zext <2 x i32> %res to <2 x i64>
+ ret <2 x i64> %res1
+}
diff --git a/test/CodeGen/AArch64/arm64-vadd.ll b/test/CodeGen/AArch64/arm64-vadd.ll
new file mode 100644
index 0000000..9ed8aa6
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vadd.ll
@@ -0,0 +1,941 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+
+define <8 x i8> @addhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: addhn8b:
+;CHECK: addhn.8b
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @addhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: addhn4h:
+;CHECK: addhn.4h
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @addhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: addhn2s:
+;CHECK: addhn.2s
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
+;CHECK-LABEL: addhn2_16b:
+;CHECK: addhn.8b
+;CHECK-NEXT: addhn2.16b
+ %vaddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %vaddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %res = shufflevector <8 x i8> %vaddhn2.i, <8 x i8> %vaddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
+;CHECK-LABEL: addhn2_8h:
+;CHECK: addhn.4h
+;CHECK-NEXT: addhn2.8h
+ %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %vaddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %res = shufflevector <4 x i16> %vaddhn2.i, <4 x i16> %vaddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @addhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
+;CHECK-LABEL: addhn2_4s:
+;CHECK: addhn.2s
+;CHECK-NEXT: addhn2.4s
+ %vaddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %vaddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %res = shufflevector <2 x i32> %vaddhn2.i, <2 x i32> %vaddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+declare <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <8 x i8> @raddhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: raddhn8b:
+;CHECK: raddhn.8b
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @raddhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: raddhn4h:
+;CHECK: raddhn.4h
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @raddhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: raddhn2s:
+;CHECK: raddhn.2s
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
+;CHECK-LABEL: raddhn2_16b:
+;CHECK: raddhn.8b
+;CHECK-NEXT: raddhn2.16b
+ %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %vraddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %res = shufflevector <8 x i8> %vraddhn2.i, <8 x i8> %vraddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
+;CHECK-LABEL: raddhn2_8h:
+;CHECK: raddhn.4h
+;CHECK-NEXT: raddhn2.8h
+ %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %vraddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %res = shufflevector <4 x i16> %vraddhn2.i, <4 x i16> %vraddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @raddhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
+;CHECK-LABEL: raddhn2_4s:
+;CHECK: raddhn.2s
+;CHECK-NEXT: raddhn2.4s
+ %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %vraddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %res = shufflevector <2 x i32> %vraddhn2.i, <2 x i32> %vraddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+
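+; The [su]addl tests use plain sext/zext plus add and expect the widening
+; add instructions to be selected from that pattern.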
+define <8 x i16> @saddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: saddl8h:
+;CHECK: saddl.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
+ %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @saddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: saddl4s:
+;CHECK: saddl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
+ %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @saddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: saddl2d:
+;CHECK: saddl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
+ %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+ %tmp5 = add <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind {
+; CHECK-LABEL: saddl2_8h:
+; CHECK-NEXT: saddl2.8h v0, v0, v1
+; CHECK-NEXT: ret
+ %tmp = bitcast <16 x i8> %a to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
+ %vmovl.i.i.i = sext <8 x i8> %tmp1 to <8 x i16>
+ %tmp2 = bitcast <16 x i8> %b to <2 x i64>
+ %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
+ %vmovl.i.i5.i = sext <8 x i8> %tmp3 to <8 x i16>
+ %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind {
+; CHECK-LABEL: saddl2_4s:
+; CHECK-NEXT: saddl2.4s v0, v0, v1
+; CHECK-NEXT: ret
+ %tmp = bitcast <8 x i16> %a to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
+ %vmovl.i.i.i = sext <4 x i16> %tmp1 to <4 x i32>
+ %tmp2 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
+ %vmovl.i.i5.i = sext <4 x i16> %tmp3 to <4 x i32>
+ %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind {
+; CHECK-LABEL: saddl2_2d:
+; CHECK-NEXT: saddl2.2d v0, v0, v1
+; CHECK-NEXT: ret
+ %tmp = bitcast <4 x i32> %a to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
+ %vmovl.i.i.i = sext <2 x i32> %tmp1 to <2 x i64>
+ %tmp2 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
+ %vmovl.i.i5.i = sext <2 x i32> %tmp3 to <2 x i64>
+ %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
+ ret <2 x i64> %add.i
+}
+
+define <8 x i16> @uaddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uaddl8h:
+;CHECK: uaddl.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
+ %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @uaddl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uaddl4s:
+;CHECK: uaddl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
+ %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @uaddl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uaddl2d:
+;CHECK: uaddl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
+ %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+ %tmp5 = add <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+
+define <8 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind {
+; CHECK-LABEL: uaddl2_8h:
+; CHECK-NEXT: uaddl2.8h v0, v0, v1
+; CHECK-NEXT: ret
+ %tmp = bitcast <16 x i8> %a to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
+ %vmovl.i.i.i = zext <8 x i8> %tmp1 to <8 x i16>
+ %tmp2 = bitcast <16 x i8> %b to <2 x i64>
+ %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <8 x i8>
+ %vmovl.i.i5.i = zext <8 x i8> %tmp3 to <8 x i16>
+ %add.i = add <8 x i16> %vmovl.i.i.i, %vmovl.i.i5.i
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind {
+; CHECK-LABEL: uaddl2_4s:
+; CHECK-NEXT: uaddl2.4s v0, v0, v1
+; CHECK-NEXT: ret
+ %tmp = bitcast <8 x i16> %a to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
+ %vmovl.i.i.i = zext <4 x i16> %tmp1 to <4 x i32>
+ %tmp2 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <4 x i16>
+ %vmovl.i.i5.i = zext <4 x i16> %tmp3 to <4 x i32>
+ %add.i = add <4 x i32> %vmovl.i.i.i, %vmovl.i.i5.i
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind {
+; CHECK-LABEL: uaddl2_2d:
+; CHECK-NEXT: uaddl2.2d v0, v0, v1
+; CHECK-NEXT: ret
+ %tmp = bitcast <4 x i32> %a to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
+ %vmovl.i.i.i = zext <2 x i32> %tmp1 to <2 x i64>
+ %tmp2 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i4.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i.i4.i to <2 x i32>
+ %vmovl.i.i5.i = zext <2 x i32> %tmp3 to <2 x i64>
+ %add.i = add <2 x i64> %vmovl.i.i.i, %vmovl.i.i5.i
+ ret <2 x i64> %add.i
+}
+
+define <8 x i16> @uaddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uaddw8h:
+;CHECK: uaddw.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
+ %tmp4 = add <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @uaddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uaddw4s:
+;CHECK: uaddw.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
+ %tmp4 = add <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @uaddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uaddw2d:
+;CHECK: uaddw.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
+ %tmp4 = add <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @uaddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uaddw2_8h:
+;CHECK: uaddw2.8h
+ %tmp1 = load <8 x i16>* %A
+
+ %tmp2 = load <16 x i8>* %B
+ %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %ext2 = zext <8 x i8> %high2 to <8 x i16>
+
+ %res = add <8 x i16> %tmp1, %ext2
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @uaddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uaddw2_4s:
+;CHECK: uaddw2.4s
+ %tmp1 = load <4 x i32>* %A
+
+ %tmp2 = load <8 x i16>* %B
+ %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %ext2 = zext <4 x i16> %high2 to <4 x i32>
+
+ %res = add <4 x i32> %tmp1, %ext2
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @uaddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uaddw2_2d:
+;CHECK: uaddw2.2d
+ %tmp1 = load <2 x i64>* %A
+
+ %tmp2 = load <4 x i32>* %B
+ %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %ext2 = zext <2 x i32> %high2 to <2 x i64>
+
+ %res = add <2 x i64> %tmp1, %ext2
+ ret <2 x i64> %res
+}
+
+define <8 x i16> @saddw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: saddw8h:
+;CHECK: saddw.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
+ %tmp4 = add <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @saddw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: saddw4s:
+;CHECK: saddw.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
+ %tmp4 = add <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @saddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: saddw2d:
+;CHECK: saddw.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
+ %tmp4 = add <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @saddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: saddw2_8h:
+;CHECK: saddw2.8h
+ %tmp1 = load <8 x i16>* %A
+
+ %tmp2 = load <16 x i8>* %B
+ %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %ext2 = sext <8 x i8> %high2 to <8 x i16>
+
+ %res = add <8 x i16> %tmp1, %ext2
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @saddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: saddw2_4s:
+;CHECK: saddw2.4s
+ %tmp1 = load <4 x i32>* %A
+
+ %tmp2 = load <8 x i16>* %B
+ %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %ext2 = sext <4 x i16> %high2 to <4 x i32>
+
+ %res = add <4 x i32> %tmp1, %ext2
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @saddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: saddw2_2d:
+;CHECK: saddw2.2d
+ %tmp1 = load <2 x i64>* %A
+
+ %tmp2 = load <4 x i32>* %B
+ %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %ext2 = sext <2 x i32> %high2 to <2 x i64>
+
+ %res = add <2 x i64> %tmp1, %ext2
+ ret <2 x i64> %res
+}
+
+define <4 x i16> @saddlp4h(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: saddlp4h:
+;CHECK: saddlp.4h
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @saddlp2s(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: saddlp2s:
+;CHECK: saddlp.2s
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @saddlp1d(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: saddlp1d:
+;CHECK: saddlp.1d
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %tmp1)
+ ret <1 x i64> %tmp3
+}
+
+define <8 x i16> @saddlp8h(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: saddlp8h:
+;CHECK: saddlp.8h
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @saddlp4s(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: saddlp4s:
+;CHECK: saddlp.4s
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @saddlp2d(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: saddlp2d:
+;CHECK: saddlp.2d
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
+ ret <2 x i64> %tmp3
+}
+
+declare <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
+
+define <4 x i16> @uaddlp4h(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: uaddlp4h:
+;CHECK: uaddlp.4h
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uaddlp2s(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: uaddlp2s:
+;CHECK: uaddlp.2s
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @uaddlp1d(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: uaddlp1d:
+;CHECK: uaddlp.1d
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %tmp1)
+ ret <1 x i64> %tmp3
+}
+
+define <8 x i16> @uaddlp8h(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: uaddlp8h:
+;CHECK: uaddlp.8h
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @uaddlp4s(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: uaddlp4s:
+;CHECK: uaddlp.4s
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @uaddlp2d(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: uaddlp2d:
+;CHECK: uaddlp.2d
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
+ ret <2 x i64> %tmp3
+}
+
+declare <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
+
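+; The [su]adalp tests pair an addlp intrinsic with a plain add of the
+; accumulator and expect the accumulating pairwise-add instructions.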
+define <4 x i16> @sadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sadalp4h:
+;CHECK: sadalp.4h
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
+ %tmp4 = load <4 x i16>* %B
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @sadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sadalp2s:
+;CHECK: sadalp.2s
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
+ %tmp4 = load <2 x i32>* %B
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <8 x i16> @sadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sadalp8h:
+;CHECK: sadalp.8h
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
+ %tmp4 = load <8 x i16>* %B
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @sadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sadalp4s:
+;CHECK: sadalp.4s
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
+ %tmp4 = load <4 x i32>* %B
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @sadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: sadalp2d:
+;CHECK: sadalp.2d
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
+ %tmp4 = load <2 x i64>* %B
+ %tmp5 = add <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <4 x i16> @uadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uadalp4h:
+;CHECK: uadalp.4h
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
+ %tmp4 = load <4 x i16>* %B
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @uadalp2s(<4 x i16>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uadalp2s:
+;CHECK: uadalp.2s
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
+ %tmp4 = load <2 x i32>* %B
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <8 x i16> @uadalp8h(<16 x i8>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uadalp8h:
+;CHECK: uadalp.8h
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
+ %tmp4 = load <8 x i16>* %B
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @uadalp4s(<8 x i16>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uadalp4s:
+;CHECK: uadalp.4s
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
+ %tmp4 = load <4 x i32>* %B
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @uadalp2d(<4 x i32>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: uadalp2d:
+;CHECK: uadalp.2d
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
+ %tmp4 = load <2 x i64>* %B
+ %tmp5 = add <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i8> @addp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: addp_8b:
+;CHECK: addp.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @addp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: addp_16b:
+;CHECK: addp.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @addp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: addp_4h:
+;CHECK: addp.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @addp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: addp_8h:
+;CHECK: addp.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @addp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: addp_2s:
+;CHECK: addp.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @addp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: addp_4s:
+;CHECK: addp.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @addp_2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: addp_2d:
+;CHECK: addp.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <2 x float> @faddp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: faddp_2s:
+;CHECK: faddp.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @faddp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: faddp_4s:
+;CHECK: faddp.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @faddp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: faddp_2d:
+;CHECK: faddp.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
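+; Widening an extracted high half together with a dup'ed scalar operand
+; should select the "2" widening forms directly; CHECK-NOT verifies that no
+; ext.16b is needed.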
+define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: uaddl2_duprhs:
+; CHECK-NOT: ext.16b
+; CHECK: uaddl2.2d
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+ %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
+ %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
+
+ %res = add <2 x i64> %lhs.ext, %rhs.ext
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: saddl2_duplhs:
+; CHECK-NOT: ext.16b
+; CHECK: saddl2.2d
+ %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
+ %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
+
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+ %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
+ %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
+
+ %res = add <2 x i64> %lhs.ext, %rhs.ext
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: usubl2_duprhs:
+; CHECK-NOT: ext.16b
+; CHECK: usubl2.2d
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+ %lhs.ext = zext <2 x i32> %lhs.high to <2 x i64>
+ %rhs.ext = zext <2 x i32> %rhsvec to <2 x i64>
+
+ %res = sub <2 x i64> %lhs.ext, %rhs.ext
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: ssubl2_duplhs:
+; CHECK-NOT: ext.16b
+; CHECK: ssubl2.2d
+ %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0
+ %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1
+
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+ %lhs.ext = sext <2 x i32> %lhsvec to <2 x i64>
+ %rhs.ext = sext <2 x i32> %rhs.high to <2 x i64>
+
+ %res = sub <2 x i64> %lhs.ext, %rhs.ext
+ ret <2 x i64> %res
+}
+
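+; The *_natural tests express the narrowing-high operations as
+; add/sub + lshr + trunc (plus a concatenating shuffle for the "2" forms)
+; instead of using intrinsics, and still expect addhn/subhn to be selected.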
+define <8 x i8> @addhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: addhn8b_natural:
+;CHECK: addhn.8b
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %sum = add <8 x i16> %tmp1, %tmp2
+ %high_bits = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
+ ret <8 x i8> %narrowed
+}
+
+define <4 x i16> @addhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: addhn4h_natural:
+;CHECK: addhn.4h
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %sum = add <4 x i32> %tmp1, %tmp2
+ %high_bits = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
+ %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
+ ret <4 x i16> %narrowed
+}
+
+define <2 x i32> @addhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: addhn2s_natural:
+;CHECK: addhn.2s
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %sum = add <2 x i64> %tmp1, %tmp2
+ %high_bits = lshr <2 x i64> %sum, <i64 32, i64 32>
+ %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
+ ret <2 x i32> %narrowed
+}
+
+define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: addhn2_16b_natural:
+;CHECK: addhn2.16b
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %sum = add <8 x i16> %tmp1, %tmp2
+ %high_bits = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
+ %res = shufflevector <8 x i8> %low, <8 x i8> %narrowed, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: addhn2_8h_natural:
+;CHECK: addhn2.8h
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %sum = add <4 x i32> %tmp1, %tmp2
+ %high_bits = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
+ %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
+ %res = shufflevector <4 x i16> %low, <4 x i16> %narrowed, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @addhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: addhn2_4s_natural:
+;CHECK: addhn2.4s
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %sum = add <2 x i64> %tmp1, %tmp2
+ %high_bits = lshr <2 x i64> %sum, <i64 32, i64 32>
+ %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
+ %res = shufflevector <2 x i32> %low, <2 x i32> %narrowed, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+define <8 x i8> @subhn8b_natural(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: subhn8b_natural:
+;CHECK: subhn.8b
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %diff = sub <8 x i16> %tmp1, %tmp2
+ %high_bits = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
+ ret <8 x i8> %narrowed
+}
+
+define <4 x i16> @subhn4h_natural(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: subhn4h_natural:
+;CHECK: subhn.4h
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %diff = sub <4 x i32> %tmp1, %tmp2
+ %high_bits = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
+ %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
+ ret <4 x i16> %narrowed
+}
+
+define <2 x i32> @subhn2s_natural(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: subhn2s_natural:
+;CHECK: subhn.2s
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %diff = sub <2 x i64> %tmp1, %tmp2
+ %high_bits = lshr <2 x i64> %diff, <i64 32, i64 32>
+ %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
+ ret <2 x i32> %narrowed
+}
+
+define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, <8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: subhn2_16b_natural:
+;CHECK: subhn2.16b
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %diff = sub <8 x i16> %tmp1, %tmp2
+ %high_bits = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %narrowed = trunc <8 x i16> %high_bits to <8 x i8>
+ %res = shufflevector <8 x i8> %low, <8 x i8> %narrowed, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, <4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: subhn2_8h_natural:
+;CHECK: subhn2.8h
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %diff = sub <4 x i32> %tmp1, %tmp2
+ %high_bits = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
+ %narrowed = trunc <4 x i32> %high_bits to <4 x i16>
+ %res = shufflevector <4 x i16> %low, <4 x i16> %narrowed, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @subhn2_4s_natural(<2 x i32> %low, <2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: subhn2_4s_natural:
+;CHECK: subhn2.4s
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %diff = sub <2 x i64> %tmp1, %tmp2
+ %high_bits = lshr <2 x i64> %diff, <i64 32, i64 32>
+ %narrowed = trunc <2 x i64> %high_bits to <2 x i32>
+ %res = shufflevector <2 x i32> %low, <2 x i32> %narrowed, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
diff --git a/test/CodeGen/AArch64/arm64-vaddlv.ll b/test/CodeGen/AArch64/arm64-vaddlv.ll
new file mode 100644
index 0000000..2d64138
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vaddlv.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
+
+define i64 @test_vaddlv_s32(<2 x i32> %a1) nounwind readnone {
+; CHECK-LABEL: test_vaddlv_s32:
+; CHECK: saddlp.1d v[[REGNUM:[0-9]+]], v[[INREG:[0-9]+]]
+; CHECK-NEXT: fmov x[[OUTREG:[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddlv.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> %a1) nounwind
+ ret i64 %vaddlv.i
+}
+
+define i64 @test_vaddlv_u32(<2 x i32> %a1) nounwind readnone {
+; CHECK-LABEL: test_vaddlv_u32:
+; CHECK: uaddlp.1d v[[REGNUM:[0-9]+]], v[[INREG:[0-9]+]]
+; CHECK-NEXT: fmov x[[OUTREG:[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddlv.i = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v2i32(<2 x i32> %a1) nounwind
+ ret i64 %vaddlv.i
+}
+
+declare i64 @llvm.aarch64.neon.uaddlv.i64.v2i32(<2 x i32>) nounwind readnone
+
+declare i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32>) nounwind readnone
+
diff --git a/test/CodeGen/AArch64/arm64-vaddv.ll b/test/CodeGen/AArch64/arm64-vaddv.ll
new file mode 100644
index 0000000..2d92ce6
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vaddv.ll
@@ -0,0 +1,245 @@
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s -mcpu=cyclone | FileCheck %s
+
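+; Across-vector add reductions: the [su]addv and faddv intrinsics should
+; lower to addv, or to addp/faddp where addv has no suitable arrangement.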
+define signext i8 @test_vaddv_s8(<8 x i8> %a1) {
+; CHECK-LABEL: test_vaddv_s8:
+; CHECK: addv.8b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a1)
+ %0 = trunc i32 %vaddv.i to i8
+ ret i8 %0
+}
+
+define signext i16 @test_vaddv_s16(<4 x i16> %a1) {
+; CHECK-LABEL: test_vaddv_s16:
+; CHECK: addv.4h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a1)
+ %0 = trunc i32 %vaddv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vaddv_s32(<2 x i32> %a1) {
+; CHECK-LABEL: test_vaddv_s32:
+; addv.2s is not supported by the ISA, so this case is lowered with a pairwise addp instead
+; CHECK: addp.2s v[[REGNUM:[0-9]+]], v0, v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a1)
+ ret i32 %vaddv.i
+}
+
+define i64 @test_vaddv_s64(<2 x i64> %a1) {
+; CHECK-LABEL: test_vaddv_s64:
+; CHECK: addp.2d [[REGNUM:d[0-9]+]], v0
+; CHECK-NEXT: fmov x0, [[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> %a1)
+ ret i64 %vaddv.i
+}
+
+define zeroext i8 @test_vaddv_u8(<8 x i8> %a1) {
+; CHECK-LABEL: test_vaddv_u8:
+; CHECK: addv.8b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
+ %0 = trunc i32 %vaddv.i to i8
+ ret i8 %0
+}
+
+define i32 @test_vaddv_u8_masked(<8 x i8> %a1) {
+; CHECK-LABEL: test_vaddv_u8_masked:
+; CHECK: addv.8b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
+ %0 = and i32 %vaddv.i, 511 ; 0x1ff
+ ret i32 %0
+}
+
+define zeroext i16 @test_vaddv_u16(<4 x i16> %a1) {
+; CHECK-LABEL: test_vaddv_u16:
+; CHECK: addv.4h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
+ %0 = trunc i32 %vaddv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vaddv_u16_masked(<4 x i16> %a1) {
+; CHECK-LABEL: test_vaddv_u16_masked:
+; CHECK: addv.4h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
+ %0 = and i32 %vaddv.i, 3276799 ; 0x31ffff
+ ret i32 %0
+}
+
+define i32 @test_vaddv_u32(<2 x i32> %a1) {
+; CHECK-LABEL: test_vaddv_u32:
+; addv.2s is not supported by the ISA, so this case is lowered with a pairwise addp instead
+; CHECK: addp.2s v[[REGNUM:[0-9]+]], v0, v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> %a1)
+ ret i32 %vaddv.i
+}
+
+define float @test_vaddv_f32(<2 x float> %a1) {
+; CHECK-LABEL: test_vaddv_f32:
+; CHECK: faddp.2s s0, v0
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a1)
+ ret float %vaddv.i
+}
+
+define float @test_vaddv_v4f32(<4 x float> %a1) {
+; CHECK-LABEL: test_vaddv_v4f32:
+; CHECK: faddp.4s [[REGNUM:v[0-9]+]], v0, v0
+; CHECK: faddp.2s s0, [[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a1)
+ ret float %vaddv.i
+}
+
+define double @test_vaddv_f64(<2 x double> %a1) {
+; CHECK-LABEL: test_vaddv_f64:
+; CHECK: faddp.2d d0, v0
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a1)
+ ret double %vaddv.i
+}
+
+define i64 @test_vaddv_u64(<2 x i64> %a1) {
+; CHECK-LABEL: test_vaddv_u64:
+; CHECK: addp.2d [[REGNUM:d[0-9]+]], v0
+; CHECK-NEXT: fmov x0, [[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
+ ret i64 %vaddv.i
+}
+
+define <1 x i64> @test_vaddv_u64_to_vec(<2 x i64> %a1) {
+; CHECK-LABEL: test_vaddv_u64_to_vec:
+; CHECK: addp.2d d0, v0
+; CHECK-NOT: fmov
+; CHECK-NOT: ins
+; CHECK: ret
+entry:
+ %vaddv.i = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
+ %vec = insertelement <1 x i64> undef, i64 %vaddv.i, i32 0
+ ret <1 x i64> %vec
+}
+
+define signext i8 @test_vaddvq_s8(<16 x i8> %a1) {
+; CHECK-LABEL: test_vaddvq_s8:
+; CHECK: addv.16b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a1)
+ %0 = trunc i32 %vaddv.i to i8
+ ret i8 %0
+}
+
+define signext i16 @test_vaddvq_s16(<8 x i16> %a1) {
+; CHECK-LABEL: test_vaddvq_s16:
+; CHECK: addv.8h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a1)
+ %0 = trunc i32 %vaddv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vaddvq_s32(<4 x i32> %a1) {
+; CHECK-LABEL: test_vaddvq_s32:
+; CHECK: addv.4s [[REGNUM:s[0-9]+]], v0
+; CHECK-NEXT: fmov w0, [[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a1)
+ ret i32 %vaddv.i
+}
+
+define zeroext i8 @test_vaddvq_u8(<16 x i8> %a1) {
+; CHECK-LABEL: test_vaddvq_u8:
+; CHECK: addv.16b b[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> %a1)
+ %0 = trunc i32 %vaddv.i to i8
+ ret i8 %0
+}
+
+define zeroext i16 @test_vaddvq_u16(<8 x i16> %a1) {
+; CHECK-LABEL: test_vaddvq_u16:
+; CHECK: addv.8h h[[REGNUM:[0-9]+]], v0
+; CHECK-NEXT: fmov w0, s[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> %a1)
+ %0 = trunc i32 %vaddv.i to i16
+ ret i16 %0
+}
+
+define i32 @test_vaddvq_u32(<4 x i32> %a1) {
+; CHECK-LABEL: test_vaddvq_u32:
+; CHECK: addv.4s [[REGNUM:s[0-9]+]], v0
+; CHECK-NEXT: fmov [[FMOVRES:w[0-9]+]], [[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+ %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> %a1)
+ ret i32 %vaddv.i
+}
+
+declare i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8>)
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8>)
+
+declare i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64>)
+
+declare i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32>)
+
+declare i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8>)
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32>)
+
+declare i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64>)
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8>)
+
+declare float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a1)
+declare float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a1)
+declare double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a1)
diff --git a/test/CodeGen/AArch64/arm64-variadic-aapcs.ll b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
new file mode 100644
index 0000000..36a7bfd
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
@@ -0,0 +1,143 @@
+; RUN: llc -verify-machineinstrs -mtriple=arm64-linux-gnu -pre-RA-sched=linearize -enable-misched=false < %s | FileCheck %s
+
+%va_list = type {i8*, i8*, i8*, i32, i32}
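+; This mirrors the AAPCS64 va_list: { i8* __stack, i8* __gr_top, i8* __vr_top,
+; i32 __gr_offs, i32 __vr_offs }, i.e. the fields stored at byte offsets 0, 8,
+; 16, 24 and 28 in the checks below.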
+
+@var = global %va_list zeroinitializer, align 8
+
+declare void @llvm.va_start(i8*)
+
+define void @test_simple(i32 %n, ...) {
+; CHECK-LABEL: test_simple:
+; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]]
+; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #[[STACKSIZE]]
+
+; CHECK: adrp x[[VA_LIST_HI:[0-9]+]], var
+
+; CHECK: stp x1, x2, [sp, #[[GR_BASE:[0-9]+]]]
+; ... omit middle ones ...
+; CHECK: str x7, [sp, #
+
+; CHECK: stp q0, q1, [sp]
+; ... omit middle ones ...
+; CHECK: stp q6, q7, [sp, #
+
+; CHECK: str [[STACK_TOP]], [x[[VA_LIST_HI]], :lo12:var]
+
+; CHECK: add [[GR_TOPTMP:x[0-9]+]], sp, #[[GR_BASE]]
+; CHECK: add [[GR_TOP:x[0-9]+]], [[GR_TOPTMP]], #56
+; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, :lo12:var
+; CHECK: str [[GR_TOP]], [x[[VA_LIST]], #8]
+
+; CHECK: mov [[VR_TOPTMP:x[0-9]+]], sp
+; CHECK: add [[VR_TOP:x[0-9]+]], [[VR_TOPTMP]], #128
+; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
+
+; CHECK: movn [[GR_OFFS:w[0-9]+]], #0x37
+; CHECK: str [[GR_OFFS]], [x[[VA_LIST]], #24]
+
+; CHECK: orr [[VR_OFFS:w[0-9]+]], wzr, #0xffffff80
+; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28]
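+; The offsets are negative save-area sizes: movn #0x37 is -56 (x0 holds %n, so
+; 7 GP registers of 8 bytes remain for varargs) and #0xffffff80 is -128 (all 8
+; vector registers of 16 bytes).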
+
+ %addr = bitcast %va_list* @var to i8*
+ call void @llvm.va_start(i8* %addr)
+
+ ret void
+}
+
+define void @test_fewargs(i32 %n, i32 %n1, i32 %n2, float %m, ...) {
+; CHECK-LABEL: test_fewargs:
+; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]]
+; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #[[STACKSIZE]]
+
+; CHECK: adrp x[[VA_LIST_HI:[0-9]+]], var
+
+; CHECK: stp x3, x4, [sp, #[[GR_BASE:[0-9]+]]]
+; ... omit middle ones ...
+; CHECK: str x7, [sp, #
+
+; CHECK: stp q1, q2, [sp]
+; ... omit middle ones ...
+; CHECK: str q7, [sp, #
+
+; CHECK: str [[STACK_TOP]], [x[[VA_LIST_HI]], :lo12:var]
+
+; CHECK: add [[GR_TOPTMP:x[0-9]+]], sp, #[[GR_BASE]]
+; CHECK: add [[GR_TOP:x[0-9]+]], [[GR_TOPTMP]], #40
+; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, :lo12:var
+; CHECK: str [[GR_TOP]], [x[[VA_LIST]], #8]
+
+; CHECK: mov [[VR_TOPTMP:x[0-9]+]], sp
+; CHECK: add [[VR_TOP:x[0-9]+]], [[VR_TOPTMP]], #112
+; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
+
+; CHECK: movn [[GR_OFFS:w[0-9]+]], #0x27
+; CHECK: str [[GR_OFFS]], [x[[VA_LIST]], #24]
+
+; CHECK: movn [[VR_OFFS:w[0-9]+]], #0x6f
+; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28]
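+; With x0-x2 and s0 taken by the named arguments, only x3-x7 (40 bytes) and
+; q1-q7 (112 bytes) are saved, giving __gr_offs = -40 (movn #0x27) and
+; __vr_offs = -112 (movn #0x6f).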
+
+ %addr = bitcast %va_list* @var to i8*
+ call void @llvm.va_start(i8* %addr)
+
+ ret void
+}
+
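+; All eight GP and all eight vector argument registers are consumed by the
+; named [8 x i64] and [8 x float] parameters, so no register save area is
+; needed and __stack is simply the incoming stack pointer.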
+define void @test_nospare([8 x i64], [8 x float], ...) {
+; CHECK-LABEL: test_nospare:
+
+ %addr = bitcast %va_list* @var to i8*
+ call void @llvm.va_start(i8* %addr)
+; CHECK-NOT: sub sp, sp
+; CHECK: mov [[STACK:x[0-9]+]], sp
+; CHECK: str [[STACK]], [{{x[0-9]+}}, :lo12:var]
+
+ ret void
+}
+
+; If there are non-variadic arguments on the stack (here two i64s), the
+; __stack field should point just past them.
+define void @test_offsetstack([10 x i64], [3 x float], ...) {
+; CHECK-LABEL: test_offsetstack:
+; CHECK: sub sp, sp, #80
+; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #96
+; CHECK: str [[STACK_TOP]], [{{x[0-9]+}}, :lo12:var]
+
+ %addr = bitcast %va_list* @var to i8*
+ call void @llvm.va_start(i8* %addr)
+ ret void
+}
+
+declare void @llvm.va_end(i8*)
+
+define void @test_va_end() nounwind {
+; CHECK-LABEL: test_va_end:
+; CHECK-NEXT: BB#0
+
+ %addr = bitcast %va_list* @var to i8*
+ call void @llvm.va_end(i8* %addr)
+
+ ret void
+; CHECK-NEXT: ret
+}
+
+declare void @llvm.va_copy(i8* %dest, i8* %src)
+
+@second_list = global %va_list zeroinitializer
+
+define void @test_va_copy() {
+; CHECK-LABEL: test_va_copy:
+ %srcaddr = bitcast %va_list* @var to i8*
+ %dstaddr = bitcast %va_list* @second_list to i8*
+ call void @llvm.va_copy(i8* %dstaddr, i8* %srcaddr)
+
+; CHECK: add x[[SRC:[0-9]+]], {{x[0-9]+}}, :lo12:var
+
+; CHECK: ldr [[BLOCK:q[0-9]+]], [x[[SRC]]]
+; CHECK: add x[[DST:[0-9]+]], {{x[0-9]+}}, :lo12:second_list
+; CHECK: str [[BLOCK]], [x[[DST]]]
+
+; CHECK: ldr [[BLOCK:q[0-9]+]], [x[[SRC]], #16]
+; CHECK: str [[BLOCK]], [x[[DST]], #16]
+ ret void
+; CHECK: ret
+}
diff --git a/test/CodeGen/AArch64/arm64-vbitwise.ll b/test/CodeGen/AArch64/arm64-vbitwise.ll
new file mode 100644
index 0000000..93de95e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vbitwise.ll
@@ -0,0 +1,91 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @rbit_8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: rbit_8b:
+;CHECK: rbit.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @rbit_16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: rbit_16b:
+;CHECK: rbit.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8>) nounwind readnone
+
+define <8 x i16> @sxtl8h(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: sxtl8h:
+;CHECK: sshll.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
+ ret <8 x i16> %tmp2
+}
+
+define <8 x i16> @uxtl8h(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: uxtl8h:
+;CHECK: ushll.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @sxtl4s(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: sxtl4s:
+;CHECK: sshll.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
+ ret <4 x i32> %tmp2
+}
+
+define <4 x i32> @uxtl4s(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: uxtl4s:
+;CHECK: ushll.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @sxtl2d(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: sxtl2d:
+;CHECK: sshll.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
+ ret <2 x i64> %tmp2
+}
+
+define <2 x i64> @uxtl2d(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: uxtl2d:
+;CHECK: ushll.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
+ ret <2 x i64> %tmp2
+}
+
+; Check for incorrect use of vector bic.
+; rdar://11553859
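+; The per-lane byte mask should be materialized with movi.2d and applied with
+; a plain and.16b; the CHECK-NOT below guards against the incorrect vector bic
+; this pattern used to produce.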
+define void @test_vsliq(i8* nocapture %src, i8* nocapture %dest) nounwind noinline ssp {
+entry:
+; CHECK-LABEL: test_vsliq:
+; CHECK-NOT: bic
+; CHECK: movi.2d [[REG1:v[0-9]+]], #0x0000ff000000ff
+; CHECK: and.16b v{{[0-9]+}}, v{{[0-9]+}}, [[REG1]]
+ %0 = bitcast i8* %src to <16 x i8>*
+ %1 = load <16 x i8>* %0, align 16
+ %and.i = and <16 x i8> %1, <i8 -1, i8 0, i8 0, i8 0, i8 -1, i8 0, i8 0, i8 0, i8 -1, i8 0, i8 0, i8 0, i8 -1, i8 0, i8 0, i8 0>
+ %2 = bitcast <16 x i8> %and.i to <8 x i16>
+ %vshl_n = shl <8 x i16> %2, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %3 = or <8 x i16> %2, %vshl_n
+ %4 = bitcast <8 x i16> %3 to <4 x i32>
+ %vshl_n8 = shl <4 x i32> %4, <i32 16, i32 16, i32 16, i32 16>
+ %5 = or <4 x i32> %4, %vshl_n8
+ %6 = bitcast <4 x i32> %5 to <16 x i8>
+ %7 = bitcast i8* %dest to <16 x i8>*
+ store <16 x i8> %6, <16 x i8>* %7, align 16
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-vclz.ll b/test/CodeGen/AArch64/arm64-vclz.ll
new file mode 100644
index 0000000..cf5670a
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vclz.ll
@@ -0,0 +1,109 @@
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
+
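+; The second operand of llvm.ctlz (i1 false) means a zero input is fully
+; defined, which matches the semantics of the clz instruction, so every call
+; below should lower to a single clz.
+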
+define <8 x i8> @test_vclz_u8(<8 x i8> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclz_u8:
+ ; CHECK: clz.8b v0, v0
+ ; CHECK-NEXT: ret
+ %vclz.i = tail call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) nounwind
+ ret <8 x i8> %vclz.i
+}
+
+define <8 x i8> @test_vclz_s8(<8 x i8> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclz_s8:
+ ; CHECK: clz.8b v0, v0
+ ; CHECK-NEXT: ret
+ %vclz.i = tail call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) nounwind
+ ret <8 x i8> %vclz.i
+}
+
+define <4 x i16> @test_vclz_u16(<4 x i16> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclz_u16:
+ ; CHECK: clz.4h v0, v0
+ ; CHECK-NEXT: ret
+ %vclz1.i = tail call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) nounwind
+ ret <4 x i16> %vclz1.i
+}
+
+define <4 x i16> @test_vclz_s16(<4 x i16> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclz_s16:
+ ; CHECK: clz.4h v0, v0
+ ; CHECK-NEXT: ret
+ %vclz1.i = tail call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) nounwind
+ ret <4 x i16> %vclz1.i
+}
+
+define <2 x i32> @test_vclz_u32(<2 x i32> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclz_u32:
+ ; CHECK: clz.2s v0, v0
+ ; CHECK-NEXT: ret
+ %vclz1.i = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) nounwind
+ ret <2 x i32> %vclz1.i
+}
+
+define <2 x i32> @test_vclz_s32(<2 x i32> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclz_s32:
+ ; CHECK: clz.2s v0, v0
+ ; CHECK-NEXT: ret
+ %vclz1.i = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) nounwind
+ ret <2 x i32> %vclz1.i
+}
+
+define <16 x i8> @test_vclzq_u8(<16 x i8> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclzq_u8:
+ ; CHECK: clz.16b v0, v0
+ ; CHECK-NEXT: ret
+ %vclz.i = tail call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) nounwind
+ ret <16 x i8> %vclz.i
+}
+
+define <16 x i8> @test_vclzq_s8(<16 x i8> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclzq_s8:
+ ; CHECK: clz.16b v0, v0
+ ; CHECK-NEXT: ret
+ %vclz.i = tail call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) nounwind
+ ret <16 x i8> %vclz.i
+}
+
+define <8 x i16> @test_vclzq_u16(<8 x i16> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclzq_u16:
+ ; CHECK: clz.8h v0, v0
+ ; CHECK-NEXT: ret
+ %vclz1.i = tail call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) nounwind
+ ret <8 x i16> %vclz1.i
+}
+
+define <8 x i16> @test_vclzq_s16(<8 x i16> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclzq_s16:
+ ; CHECK: clz.8h v0, v0
+ ; CHECK-NEXT: ret
+ %vclz1.i = tail call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) nounwind
+ ret <8 x i16> %vclz1.i
+}
+
+define <4 x i32> @test_vclzq_u32(<4 x i32> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclzq_u32:
+ ; CHECK: clz.4s v0, v0
+ ; CHECK-NEXT: ret
+ %vclz1.i = tail call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) nounwind
+ ret <4 x i32> %vclz1.i
+}
+
+define <4 x i32> @test_vclzq_s32(<4 x i32> %a) nounwind readnone ssp {
+ ; CHECK-LABEL: test_vclzq_s32:
+ ; CHECK: clz.4s v0, v0
+ ; CHECK-NEXT: ret
+ %vclz1.i = tail call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) nounwind
+ ret <4 x i32> %vclz1.i
+}
+
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
+
+declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) nounwind readnone
+
+declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) nounwind readnone
+
+declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
+
+declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone
+
+declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>, i1) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-vcmp.ll b/test/CodeGen/AArch64/arm64-vcmp.ll
new file mode 100644
index 0000000..982ab09
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vcmp.ll
@@ -0,0 +1,236 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+
+define void @fcmltz_4s(<4 x float> %a, <4 x i16>* %p) nounwind {
+;CHECK-LABEL: fcmltz_4s:
+;CHECK: fcmlt.4s [[REG:v[0-9]+]], v0, #0
+;CHECK-NEXT: xtn.4h v[[REG_1:[0-9]+]], [[REG]]
+;CHECK-NEXT: str d[[REG_1]], [x0]
+;CHECK-NEXT: ret
+ %tmp = fcmp olt <4 x float> %a, zeroinitializer
+ %tmp2 = sext <4 x i1> %tmp to <4 x i16>
+ store <4 x i16> %tmp2, <4 x i16>* %p, align 8
+ ret void
+}
+
+define <2 x i32> @facge_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: facge_2s:
+;CHECK: facge.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @facge_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: facge_4s:
+;CHECK: facge.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @facge_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: facge_2d:
+;CHECK: facge.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x i32> @facgt_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: facgt_2s:
+;CHECK: facgt.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @facgt_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: facgt_4s:
+;CHECK: facgt.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @facgt_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: facgt_2d:
+;CHECK: facgt.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define i32 @facge_s(float %A, float %B) nounwind {
+; CHECK-LABEL: facge_s:
+; CHECK: facge {{s[0-9]+}}, s0, s1
+ %mask = call i32 @llvm.aarch64.neon.facge.i32.f32(float %A, float %B)
+ ret i32 %mask
+}
+
+define i64 @facge_d(double %A, double %B) nounwind {
+; CHECK-LABEL: facge_d:
+; CHECK: facge {{d[0-9]+}}, d0, d1
+ %mask = call i64 @llvm.aarch64.neon.facge.i64.f64(double %A, double %B)
+ ret i64 %mask
+}
+
+declare i64 @llvm.aarch64.neon.facge.i64.f64(double, double)
+declare i32 @llvm.aarch64.neon.facge.i32.f32(float, float)
+
+define i32 @facgt_s(float %A, float %B) nounwind {
+; CHECK-LABEL: facgt_s:
+; CHECK: facgt {{s[0-9]+}}, s0, s1
+ %mask = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %A, float %B)
+ ret i32 %mask
+}
+
+define i64 @facgt_d(double %A, double %B) nounwind {
+; CHECK-LABEL: facgt_d:
+; CHECK: facgt {{d[0-9]+}}, d0, d1
+ %mask = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %A, double %B)
+ ret i64 %mask
+}
+
+declare i64 @llvm.aarch64.neon.facgt.i64.f64(double, double)
+declare i32 @llvm.aarch64.neon.facgt.i32.f32(float, float)
+
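+; cmtst is a bitwise "test" compare: the (and, icmp ne 0, sext) sequence in
+; each function below is the canonical IR for it and should select to a single
+; cmtst instruction.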
+define <8 x i8> @cmtst_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: cmtst_8b:
+;CHECK: cmtst.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %commonbits = and <8 x i8> %tmp1, %tmp2
+ %mask = icmp ne <8 x i8> %commonbits, zeroinitializer
+ %res = sext <8 x i1> %mask to <8 x i8>
+ ret <8 x i8> %res
+}
+
+define <16 x i8> @cmtst_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: cmtst_16b:
+;CHECK: cmtst.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %commonbits = and <16 x i8> %tmp1, %tmp2
+ %mask = icmp ne <16 x i8> %commonbits, zeroinitializer
+ %res = sext <16 x i1> %mask to <16 x i8>
+ ret <16 x i8> %res
+}
+
+define <4 x i16> @cmtst_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: cmtst_4h:
+;CHECK: cmtst.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %commonbits = and <4 x i16> %tmp1, %tmp2
+ %mask = icmp ne <4 x i16> %commonbits, zeroinitializer
+ %res = sext <4 x i1> %mask to <4 x i16>
+ ret <4 x i16> %res
+}
+
+define <8 x i16> @cmtst_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: cmtst_8h:
+;CHECK: cmtst.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %commonbits = and <8 x i16> %tmp1, %tmp2
+ %mask = icmp ne <8 x i16> %commonbits, zeroinitializer
+ %res = sext <8 x i1> %mask to <8 x i16>
+ ret <8 x i16> %res
+}
+
+define <2 x i32> @cmtst_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: cmtst_2s:
+;CHECK: cmtst.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %commonbits = and <2 x i32> %tmp1, %tmp2
+ %mask = icmp ne <2 x i32> %commonbits, zeroinitializer
+ %res = sext <2 x i1> %mask to <2 x i32>
+ ret <2 x i32> %res
+}
+
+define <4 x i32> @cmtst_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: cmtst_4s:
+;CHECK: cmtst.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %commonbits = and <4 x i32> %tmp1, %tmp2
+ %mask = icmp ne <4 x i32> %commonbits, zeroinitializer
+ %res = sext <4 x i1> %mask to <4 x i32>
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @cmtst_2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: cmtst_2d:
+;CHECK: cmtst.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %commonbits = and <2 x i64> %tmp1, %tmp2
+ %mask = icmp ne <2 x i64> %commonbits, zeroinitializer
+ %res = sext <2 x i1> %mask to <2 x i64>
+ ret <2 x i64> %res
+}
+
+define <1 x i64> @fcmeq_d(<1 x double> %A, <1 x double> %B) nounwind {
+; CHECK-LABEL: fcmeq_d:
+; CHECK: fcmeq {{d[0-9]+}}, d0, d1
+ %tst = fcmp oeq <1 x double> %A, %B
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmge_d(<1 x double> %A, <1 x double> %B) nounwind {
+; CHECK-LABEL: fcmge_d:
+; CHECK: fcmge {{d[0-9]+}}, d0, d1
+ %tst = fcmp oge <1 x double> %A, %B
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmle_d(<1 x double> %A, <1 x double> %B) nounwind {
+; CHECK-LABEL: fcmle_d:
+; CHECK: fcmge {{d[0-9]+}}, d1, d0
+ %tst = fcmp ole <1 x double> %A, %B
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmgt_d(<1 x double> %A, <1 x double> %B) nounwind {
+; CHECK-LABEL: fcmgt_d:
+; CHECK: fcmgt {{d[0-9]+}}, d0, d1
+ %tst = fcmp ogt <1 x double> %A, %B
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @fcmlt_d(<1 x double> %A, <1 x double> %B) nounwind {
+; CHECK-LABEL: fcmlt_d:
+; CHECK: fcmgt {{d[0-9]+}}, d1, d0
+ %tst = fcmp olt <1 x double> %A, %B
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
+
+define <1 x i64> @cmnez_d(<1 x i64> %A) nounwind {
+; CHECK-LABEL: cmnez_d:
+; CHECK: cmeq d[[EQ:[0-9]+]], d0, #0
+; CHECK: mvn.8b v0, v[[EQ]]
+ %tst = icmp ne <1 x i64> %A, zeroinitializer
+ %mask = sext <1 x i1> %tst to <1 x i64>
+ ret <1 x i64> %mask
+}
diff --git a/test/CodeGen/AArch64/arm64-vcnt.ll b/test/CodeGen/AArch64/arm64-vcnt.ll
new file mode 100644
index 0000000..903501e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vcnt.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @cls_8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: cls_8b:
+;CHECK: cls.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> %tmp1)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @cls_16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: cls_16b:
+;CHECK: cls.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> %tmp1)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @cls_4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: cls_4h:
+;CHECK: cls.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> %tmp1)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @cls_8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: cls_8h:
+;CHECK: cls.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> %tmp1)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @cls_2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: cls_2s:
+;CHECK: cls.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @cls_4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: cls_4s:
+;CHECK: cls.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32>) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-vcombine.ll b/test/CodeGen/AArch64/arm64-vcombine.ll
new file mode 100644
index 0000000..fa12996
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vcombine.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+; LowerCONCAT_VECTORS() was reversing the order of two parts.
+; rdar://11558157
+; rdar://11559553
+define <16 x i8> @test(<16 x i8> %q0, <16 x i8> %q1, i8* nocapture %dest) nounwind {
+entry:
+; CHECK-LABEL: test:
+; CHECK: ins.d v0[1], v1[0]
+ %0 = bitcast <16 x i8> %q0 to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> zeroinitializer
+ %1 = bitcast <16 x i8> %q1 to <2 x i64>
+ %shuffle.i4 = shufflevector <2 x i64> %1, <2 x i64> undef, <1 x i32> zeroinitializer
+ %shuffle.i3 = shufflevector <1 x i64> %shuffle.i, <1 x i64> %shuffle.i4, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i3 to <16 x i8>
+ ret <16 x i8> %2
+}
diff --git a/test/CodeGen/AArch64/arm64-vcvt.ll b/test/CodeGen/AArch64/arm64-vcvt.ll
new file mode 100644
index 0000000..8c9e4e9
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vcvt.ll
@@ -0,0 +1,686 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <2 x i32> @fcvtas_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtas_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtas.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtas_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtas_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtas.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> %A)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtas_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtas_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtas.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> %A)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtau_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtau_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtau.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtau_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtau_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtau.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> %A)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtau_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtau_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtau.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> %A)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtms_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtms_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtms.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtms_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtms_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtms.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> %A)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtms_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtms_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtms.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> %A)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtmu_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtmu_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtmu.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtmu_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtmu_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtmu.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> %A)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtmu_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtmu_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtmu.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> %A)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtps_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtps_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtps.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtps_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtps_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtps.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> %A)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtps_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtps_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtps.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> %A)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtpu_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtpu_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtpu.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtpu_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtpu_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtpu.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> %A)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtpu_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtpu_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtpu.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> %A)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtns_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtns_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtns.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtns_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtns_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtns.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> %A)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtns_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtns_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtns.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> %A)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtnu_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtnu_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtnu.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtnu_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtnu_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtnu.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> %A)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtnu_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtnu_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtnu.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> %A)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double>) nounwind readnone
+
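+; Unlike the rounding conversions above, the truncating fcvtzs/fcvtzu forms
+; are reached through plain fptosi/fptoui rather than target intrinsics.
+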
+define <2 x i32> @fcvtzs_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzs_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzs.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = fptosi <2 x float> %A to <2 x i32>
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtzs_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzs_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzs.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = fptosi <4 x float> %A to <4 x i32>
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtzs_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtzs_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtzs.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = fptosi <2 x double> %A to <2 x i64>
+ ret <2 x i64> %tmp3
+}
+
+
+define <2 x i32> @fcvtzu_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzu_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzu.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = fptoui <2 x float> %A to <2 x i32>
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtzu_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzu_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzu.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = fptoui <4 x float> %A to <4 x i32>
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtzu_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtzu_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtzu.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = fptoui <2 x double> %A to <2 x i64>
+ ret <2 x i64> %tmp3
+}
+
+define <2 x float> @frinta_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: frinta_2s:
+;CHECK-NOT: ld1
+;CHECK: frinta.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.round.v2f32(<2 x float> %A)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frinta_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: frinta_4s:
+;CHECK-NOT: ld1
+;CHECK: frinta.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x float> @llvm.round.v4f32(<4 x float> %A)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frinta_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: frinta_2d:
+;CHECK-NOT: ld1
+;CHECK: frinta.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x double> @llvm.round.v2f64(<2 x double> %A)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.round.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.round.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.round.v2f64(<2 x double>) nounwind readnone
+
+define <2 x float> @frinti_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: frinti_2s:
+;CHECK-NOT: ld1
+;CHECK: frinti.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %A)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frinti_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: frinti_4s:
+;CHECK-NOT: ld1
+;CHECK: frinti.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %A)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frinti_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: frinti_2d:
+;CHECK-NOT: ld1
+;CHECK: frinti.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %A)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) nounwind readnone
+
+define <2 x float> @frintm_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: frintm_2s:
+;CHECK-NOT: ld1
+;CHECK: frintm.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.floor.v2f32(<2 x float> %A)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frintm_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: frintm_4s:
+;CHECK-NOT: ld1
+;CHECK: frintm.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x float> @llvm.floor.v4f32(<4 x float> %A)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frintm_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: frintm_2d:
+;CHECK-NOT: ld1
+;CHECK: frintm.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x double> @llvm.floor.v2f64(<2 x double> %A)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.floor.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.floor.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.floor.v2f64(<2 x double>) nounwind readnone
+
+define <2 x float> @frintn_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: frintn_2s:
+;CHECK-NOT: ld1
+;CHECK: frintn.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float> %A)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frintn_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: frintn_4s:
+;CHECK-NOT: ld1
+;CHECK: frintn.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float> %A)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frintn_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: frintn_2d:
+;CHECK-NOT: ld1
+;CHECK: frintn.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double> %A)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double>) nounwind readnone
+
+define <2 x float> @frintp_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: frintp_2s:
+;CHECK-NOT: ld1
+;CHECK: frintp.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.ceil.v2f32(<2 x float> %A)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frintp_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: frintp_4s:
+;CHECK-NOT: ld1
+;CHECK: frintp.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %A)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frintp_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: frintp_2d:
+;CHECK-NOT: ld1
+;CHECK: frintp.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %A)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.ceil.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.ceil.v2f64(<2 x double>) nounwind readnone
+
+define <2 x float> @frintx_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: frintx_2s:
+;CHECK-NOT: ld1
+;CHECK: frintx.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.rint.v2f32(<2 x float> %A)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frintx_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: frintx_4s:
+;CHECK-NOT: ld1
+;CHECK: frintx.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x float> @llvm.rint.v4f32(<4 x float> %A)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frintx_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: frintx_2d:
+;CHECK-NOT: ld1
+;CHECK: frintx.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x double> @llvm.rint.v2f64(<2 x double> %A)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.rint.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.rint.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.rint.v2f64(<2 x double>) nounwind readnone
+
+define <2 x float> @frintz_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: frintz_2s:
+;CHECK-NOT: ld1
+;CHECK: frintz.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.trunc.v2f32(<2 x float> %A)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frintz_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: frintz_4s:
+;CHECK-NOT: ld1
+;CHECK: frintz.4s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %A)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frintz_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: frintz_2d:
+;CHECK-NOT: ld1
+;CHECK: frintz.2d v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %A)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.trunc.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.trunc.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone
+
+define <2 x float> @fcvtxn_2s(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtxn_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtxn v0.2s, v0.2d
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %A)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @fcvtxn_4s(<2 x float> %ret, <2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtxn_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtxn2 v0.4s, v1.2d
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %A)
+ %res = shufflevector <2 x float> %ret, <2 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x float> %res
+}
+
+declare <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone
+
+define <2 x i32> @fcvtzsc_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzsc_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzs.2s v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %A, i32 1)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtzsc_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzsc_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzs.4s v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %A, i32 1)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtzsc_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtzsc_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtzs.2d v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> %A, i32 1)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32) nounwind readnone
+
+define <2 x i32> @fcvtzuc_2s(<2 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzuc_2s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzu.2s v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %A, i32 1)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @fcvtzuc_4s(<4 x float> %A) nounwind {
+;CHECK-LABEL: fcvtzuc_4s:
+;CHECK-NOT: ld1
+;CHECK: fcvtzu.4s v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %A, i32 1)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @fcvtzuc_2d(<2 x double> %A) nounwind {
+;CHECK-LABEL: fcvtzuc_2d:
+;CHECK-NOT: ld1
+;CHECK: fcvtzu.2d v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> %A, i32 1)
+ ret <2 x i64> %tmp3
+}
+
+declare <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32) nounwind readnone
+
+define <2 x float> @scvtf_2sc(<2 x i32> %A) nounwind {
+;CHECK-LABEL: scvtf_2sc:
+;CHECK-NOT: ld1
+;CHECK: scvtf.2s v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %A, i32 1)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @scvtf_4sc(<4 x i32> %A) nounwind {
+;CHECK-LABEL: scvtf_4sc:
+;CHECK-NOT: ld1
+;CHECK: scvtf.4s v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %A, i32 1)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @scvtf_2dc(<2 x i64> %A) nounwind {
+;CHECK-LABEL: scvtf_2dc:
+;CHECK-NOT: ld1
+;CHECK: scvtf.2d v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %A, i32 1)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
+
+define <2 x float> @ucvtf_2sc(<2 x i32> %A) nounwind {
+;CHECK-LABEL: ucvtf_2sc:
+;CHECK-NOT: ld1
+;CHECK: ucvtf.2s v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %A, i32 1)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @ucvtf_4sc(<4 x i32> %A) nounwind {
+;CHECK-LABEL: ucvtf_4sc:
+;CHECK-NOT: ld1
+;CHECK: ucvtf.4s v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %A, i32 1)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @ucvtf_2dc(<2 x i64> %A) nounwind {
+;CHECK-LABEL: ucvtf_2dc:
+;CHECK-NOT: ld1
+;CHECK: ucvtf.2d v0, v0, #1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %A, i32 1)
+ ret <2 x double> %tmp3
+}
+
+
+;CHECK-LABEL: autogen_SD28458:
+;CHECK: fcvt
+;CHECK: ret
+define void @autogen_SD28458() {
+ %Tr53 = fptrunc <8 x double> undef to <8 x float>
+ store <8 x float> %Tr53, <8 x float>* undef
+ ret void
+}
+
+;CHECK-LABEL: autogen_SD19225:
+;CHECK: fcvt
+;CHECK: ret
+define void @autogen_SD19225() {
+ %A = load <8 x float>* undef
+ %Tr53 = fpext <8 x float> %A to <8 x double>
+ store <8 x double> %Tr53, <8 x double>* undef
+ ret void
+}
+
+declare <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-vcvt_f.ll b/test/CodeGen/AArch64/arm64-vcvt_f.ll
new file mode 100644
index 0000000..d244958
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vcvt_f.ll
@@ -0,0 +1,82 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -O0 -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <2 x double> @test_vcvt_f64_f32(<2 x float> %x) nounwind readnone ssp {
+; CHECK-LABEL: test_vcvt_f64_f32:
+ %vcvt1.i = fpext <2 x float> %x to <2 x double>
+; CHECK: fcvtl v0.2d, v0.2s
+ ret <2 x double> %vcvt1.i
+; CHECK: ret
+}
+
+define <2 x double> @test_vcvt_high_f64_f32(<4 x float> %x) nounwind readnone ssp {
+; CHECK-LABEL: test_vcvt_high_f64_f32:
+ %cvt_in = shufflevector <4 x float> %x, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+ %vcvt1.i = fpext <2 x float> %cvt_in to <2 x double>
+; CHECK: fcvtl2 v0.2d, v0.4s
+ ret <2 x double> %vcvt1.i
+; CHECK: ret
+}
+
+define <2 x float> @test_vcvt_f32_f64(<2 x double> %v) nounwind readnone ssp {
+; CHECK-LABEL: test_vcvt_f32_f64:
+ %vcvt1.i = fptrunc <2 x double> %v to <2 x float>
+; CHECK: fcvtn
+ ret <2 x float> %vcvt1.i
+; CHECK: ret
+}
+
+define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %x, <2 x double> %v) nounwind readnone ssp {
+; CHECK-LABEL: test_vcvt_high_f32_f64:
+
+ %cvt = fptrunc <2 x double> %v to <2 x float>
+ %vcvt2.i = shufflevector <2 x float> %x, <2 x float> %cvt, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK: fcvtn2
+ ret <4 x float> %vcvt2.i
+; CHECK: ret
+}
+
+define <2 x float> @test_vcvtx_f32_f64(<2 x double> %v) nounwind readnone ssp {
+; CHECK-LABEL: test_vcvtx_f32_f64:
+ %vcvtx1.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind
+; CHECK: fcvtxn
+ ret <2 x float> %vcvtx1.i
+; CHECK: ret
+}
+
+define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %x, <2 x double> %v) nounwind readnone ssp {
+; CHECK-LABEL: test_vcvtx_high_f32_f64:
+ %vcvtx2.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind
+ %res = shufflevector <2 x float> %x, <2 x float> %vcvtx2.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK: fcvtxn2
+ ret <4 x float> %res
+; CHECK: ret
+}
+
+
+declare <2 x double> @llvm.aarch64.neon.vcvthighfp2df(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.vcvtfp2df(<2 x float>) nounwind readnone
+
+declare <2 x float> @llvm.aarch64.neon.vcvtdf2fp(<2 x double>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.vcvthighdf2fp(<2 x float>, <2 x double>) nounwind readnone
+
+declare <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone
+
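+; Half-precision values are carried as raw i16 bits by these intrinsics: fcvt
+; does the actual f32<->f16 conversion, and fmov just moves the bit pattern
+; between the integer and FP register files.
+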
+define i16 @to_half(float %in) {
+; CHECK-LABEL: to_half:
+; CHECK: fcvt h[[HALFVAL:[0-9]+]], s0
+; CHECK: fmov {{w[0-9]+}}, {{s[0-9]+}}
+ %res = call i16 @llvm.convert.to.fp16(float %in)
+ ret i16 %res
+}
+
+define float @from_half(i16 %in) {
+; CHECK-LABEL: from_half:
+; CHECK: fmov s[[HALFVAL:[0-9]+]], {{w[0-9]+}}
+; CHECK: fcvt s0, h[[HALFVAL]]
+ %res = call float @llvm.convert.from.fp16(i16 %in)
+ ret float %res
+}
+
+declare float @llvm.convert.from.fp16(i16) #1
+declare i16 @llvm.convert.to.fp16(float) #1
diff --git a/test/CodeGen/AArch64/arm64-vcvt_f32_su32.ll b/test/CodeGen/AArch64/arm64-vcvt_f32_su32.ll
new file mode 100644
index 0000000..1eb7b43
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vcvt_f32_su32.ll
@@ -0,0 +1,73 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <2 x float> @ucvt(<2 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: ucvt:
+; CHECK: ucvtf.2s v0, v0
+; CHECK: ret
+
+ %vcvt.i = uitofp <2 x i32> %a to <2 x float>
+ ret <2 x float> %vcvt.i
+}
+
+define <2 x float> @scvt(<2 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: scvt:
+; CHECK: scvtf.2s v0, v0
+; CHECK: ret
+ %vcvt.i = sitofp <2 x i32> %a to <2 x float>
+ ret <2 x float> %vcvt.i
+}
+
+define <4 x float> @ucvtq(<4 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: ucvtq:
+; CHECK: ucvtf.4s v0, v0
+; CHECK: ret
+ %vcvt.i = uitofp <4 x i32> %a to <4 x float>
+ ret <4 x float> %vcvt.i
+}
+
+define <4 x float> @scvtq(<4 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: scvtq:
+; CHECK: scvtf.4s v0, v0
+; CHECK: ret
+ %vcvt.i = sitofp <4 x i32> %a to <4 x float>
+ ret <4 x float> %vcvt.i
+}
+
+define <4 x float> @cvtf16(<4 x i16> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtf16:
+; CHECK: fcvtl v0.4s, v0.4h
+; CHECK-NEXT: ret
+ %vcvt1.i = tail call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> %a) nounwind
+ ret <4 x float> %vcvt1.i
+}
+
+define <4 x float> @cvtf16_high(<8 x i16> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtf16_high:
+; CHECK: fcvtl2 v0.4s, v0.8h
+; CHECK-NEXT: ret
+ %in = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vcvt1.i = tail call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> %in) nounwind
+ ret <4 x float> %vcvt1.i
+}
+
+
+
+define <4 x i16> @cvtf16f32(<4 x float> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtf16f32:
+; CHECK: fcvtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+ %vcvt1.i = tail call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %a) nounwind
+ ret <4 x i16> %vcvt1.i
+}
+
+define <8 x i16> @cvtf16f32_high(<4 x i16> %low, <4 x float> %high_big) {
+; CHECK-LABEL: cvtf16f32_high:
+; CHECK: fcvtn2 v0.8h, v1.4s
+; CHECK-NEXT: ret
+ %high = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %high_big)
+ %res = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+declare <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float>) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-vcvt_n.ll b/test/CodeGen/AArch64/arm64-vcvt_n.ll
new file mode 100644
index 0000000..7ed5be6
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vcvt_n.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <2 x float> @cvtf32fxpu(<2 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtf32fxpu:
+; CHECK: ucvtf.2s v0, v0, #9
+; CHECK: ret
+ %vcvt_n1 = tail call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %a, i32 9)
+ ret <2 x float> %vcvt_n1
+}
+
+define <2 x float> @cvtf32fxps(<2 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtf32fxps:
+; CHECK: scvtf.2s v0, v0, #12
+; CHECK: ret
+ %vcvt_n1 = tail call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %a, i32 12)
+ ret <2 x float> %vcvt_n1
+}
+
+define <4 x float> @cvtqf32fxpu(<4 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtqf32fxpu:
+; CHECK: ucvtf.4s v0, v0, #18
+; CHECK: ret
+ %vcvt_n1 = tail call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %a, i32 18)
+ ret <4 x float> %vcvt_n1
+}
+
+define <4 x float> @cvtqf32fxps(<4 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtqf32fxps:
+; CHECK: scvtf.4s v0, v0, #30
+; CHECK: ret
+ %vcvt_n1 = tail call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %a, i32 30)
+ ret <4 x float> %vcvt_n1
+}
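+
+; f1 and f2 exercise the fixed-point 2d conversions but carry no CHECK lines,
+; presumably only verifying that the intrinsics select without error.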
+define <2 x double> @f1(<2 x i64> %a) nounwind readnone ssp {
+ %vcvt_n1 = tail call <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %a, i32 12)
+ ret <2 x double> %vcvt_n1
+}
+
+define <2 x double> @f2(<2 x i64> %a) nounwind readnone ssp {
+ %vcvt_n1 = tail call <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %a, i32 9)
+ ret <2 x double> %vcvt_n1
+}
+
+declare <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-vcvt_su32_f32.ll b/test/CodeGen/AArch64/arm64-vcvt_su32_f32.ll
new file mode 100644
index 0000000..985a5f7
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vcvt_su32_f32.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <2 x i32> @c1(<2 x float> %a) nounwind readnone ssp {
+; CHECK-LABEL: c1:
+; CHECK: fcvtzs.2s v0, v0
+; CHECK: ret
+ %vcvt.i = fptosi <2 x float> %a to <2 x i32>
+ ret <2 x i32> %vcvt.i
+}
+
+define <2 x i32> @c2(<2 x float> %a) nounwind readnone ssp {
+; CHECK-LABEL: c2:
+; CHECK: fcvtzu.2s v0, v0
+; CHECK: ret
+ %vcvt.i = fptoui <2 x float> %a to <2 x i32>
+ ret <2 x i32> %vcvt.i
+}
+
+define <4 x i32> @c3(<4 x float> %a) nounwind readnone ssp {
+; CHECK-LABEL: c3:
+; CHECK: fcvtzs.4s v0, v0
+; CHECK: ret
+ %vcvt.i = fptosi <4 x float> %a to <4 x i32>
+ ret <4 x i32> %vcvt.i
+}
+
+define <4 x i32> @c4(<4 x float> %a) nounwind readnone ssp {
+; CHECK-LABEL: c4:
+; CHECK: fcvtzu.4s v0, v0
+; CHECK: ret
+ %vcvt.i = fptoui <4 x float> %a to <4 x i32>
+ ret <4 x i32> %vcvt.i
+}
+
diff --git a/test/CodeGen/AArch64/arm64-vcvtxd_f32_f64.ll b/test/CodeGen/AArch64/arm64-vcvtxd_f32_f64.ll
new file mode 100644
index 0000000..b29c22c
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vcvtxd_f32_f64.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define float @fcvtxn(double %a) {
+; CHECK-LABEL: fcvtxn:
+; CHECK: fcvtxn s0, d0
+; CHECK-NEXT: ret
+ %vcvtxd.i = tail call float @llvm.aarch64.sisd.fcvtxn(double %a) nounwind
+ ret float %vcvtxd.i
+}
+
+declare float @llvm.aarch64.sisd.fcvtxn(double) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-vecCmpBr.ll b/test/CodeGen/AArch64/arm64-vecCmpBr.ll
new file mode 100644
index 0000000..c7321e4
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vecCmpBr.ll
@@ -0,0 +1,207 @@
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s -mcpu=cyclone | FileCheck %s
+; ModuleID = 'arm64_vecCmpBr.c'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "arm64-apple-ios3.0.0"
+
+
+define i32 @anyZero64(<4 x i16> %a) #0 {
+; CHECK: _anyZero64:
+; CHECK: uminv.8b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: b _bar
+entry:
+ %0 = bitcast <4 x i16> %a to <8 x i8>
+ %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %0) #3
+ %1 = trunc i32 %vminv.i to i8
+ %tobool = icmp eq i8 %1, 0
+ br i1 %tobool, label %if.then, label %return
+
+if.then: ; preds = %entry
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+ br label %return
+
+return: ; preds = %entry, %if.then
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+declare i32 @bar(...) #1
+
+define i32 @anyZero128(<8 x i16> %a) #0 {
+; CHECK: _anyZero128:
+; CHECK: uminv.16b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: b _bar
+
+entry:
+ %0 = bitcast <8 x i16> %a to <16 x i8>
+ %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %0) #3
+ %1 = trunc i32 %vminv.i to i8
+ %tobool = icmp eq i8 %1, 0
+ br i1 %tobool, label %if.then, label %return
+
+if.then: ; preds = %entry
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+ br label %return
+
+return: ; preds = %entry, %if.then
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define i32 @anyNonZero64(<4 x i16> %a) #0 {
+; CHECK: _anyNonZero64:
+; CHECK: umaxv.8b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: movz w0, #0
+
+entry:
+ %0 = bitcast <4 x i16> %a to <8 x i8>
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %0) #3
+ %1 = trunc i32 %vmaxv.i to i8
+ %tobool = icmp eq i8 %1, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then: ; preds = %entry
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+ br label %return
+
+return: ; preds = %entry, %if.then
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define i32 @anyNonZero128(<8 x i16> %a) #0 {
+; CHECK: _anyNonZero128:
+; CHECK: umaxv.16b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: movz w0, #0
+entry:
+ %0 = bitcast <8 x i16> %a to <16 x i8>
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %0) #3
+ %1 = trunc i32 %vmaxv.i to i8
+ %tobool = icmp eq i8 %1, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then: ; preds = %entry
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+ br label %return
+
+return: ; preds = %entry, %if.then
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define i32 @allZero64(<4 x i16> %a) #0 {
+; CHECK: _allZero64:
+; CHECK: umaxv.8b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: b _bar
+entry:
+ %0 = bitcast <4 x i16> %a to <8 x i8>
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %0) #3
+ %1 = trunc i32 %vmaxv.i to i8
+ %tobool = icmp eq i8 %1, 0
+ br i1 %tobool, label %if.then, label %return
+
+if.then: ; preds = %entry
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+ br label %return
+
+return: ; preds = %entry, %if.then
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define i32 @allZero128(<8 x i16> %a) #0 {
+; CHECK: _allZero128:
+; CHECK: umaxv.16b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: b _bar
+entry:
+ %0 = bitcast <8 x i16> %a to <16 x i8>
+ %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %0) #3
+ %1 = trunc i32 %vmaxv.i to i8
+ %tobool = icmp eq i8 %1, 0
+ br i1 %tobool, label %if.then, label %return
+
+if.then: ; preds = %entry
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+ br label %return
+
+return: ; preds = %entry, %if.then
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define i32 @allNonZero64(<4 x i16> %a) #0 {
+; CHECK: _allNonZero64:
+; CHECK: uminv.8b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: movz w0, #0
+entry:
+ %0 = bitcast <4 x i16> %a to <8 x i8>
+ %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %0) #3
+ %1 = trunc i32 %vminv.i to i8
+ %tobool = icmp eq i8 %1, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then: ; preds = %entry
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+ br label %return
+
+return: ; preds = %entry, %if.then
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+define i32 @allNonZero128(<8 x i16> %a) #0 {
+; CHECK: _allNonZero128:
+; CHECK: uminv.16b b[[REGNO1:[0-9]+]], v0
+; CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
+; CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[A-Z_0-9]+]]
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: movz w0, #0
+entry:
+ %0 = bitcast <8 x i16> %a to <16 x i8>
+ %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %0) #3
+ %1 = trunc i32 %vminv.i to i8
+ %tobool = icmp eq i8 %1, 0
+ br i1 %tobool, label %return, label %if.then
+
+if.then: ; preds = %entry
+ %call1 = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #4
+ br label %return
+
+return: ; preds = %entry, %if.then
+ %retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+declare i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8>) #2
+
+declare i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8>) #2
+
+declare i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8>) #2
+
+declare i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8>) #2
+
+attributes #0 = { nounwind ssp "target-cpu"="cyclone" }
+attributes #1 = { "target-cpu"="cyclone" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
+attributes #4 = { nobuiltin nounwind }
diff --git a/test/CodeGen/AArch64/arm64-vecFold.ll b/test/CodeGen/AArch64/arm64-vecFold.ll
new file mode 100644
index 0000000..aeacfcc
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vecFold.ll
@@ -0,0 +1,145 @@
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple -o - %s | FileCheck %s
+
+define <16 x i8> @foov16i8(<8 x i16> %a0, <8 x i16> %b0) nounwind readnone ssp {
+; CHECK-LABEL: foov16i8:
+ %vshrn_low_shift = lshr <8 x i16> %a0, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
+ %vshrn_low = trunc <8 x i16> %vshrn_low_shift to <8 x i8>
+ %vshrn_high_shift = lshr <8 x i16> %b0, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
+ %vshrn_high = trunc <8 x i16> %vshrn_high_shift to <8 x i8>
+; CHECK: shrn.8b v0, v0, #5
+; CHECK-NEXT: shrn2.16b v0, v1, #5
+; CHECK-NEXT: ret
+ %1 = bitcast <8 x i8> %vshrn_low to <1 x i64>
+ %2 = bitcast <8 x i8> %vshrn_high to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @foov8i16(<4 x i32> %a0, <4 x i32> %b0) nounwind readnone ssp {
+; CHECK-LABEL: foov8i16:
+ %vshrn_low_shift = lshr <4 x i32> %a0, <i32 5, i32 5, i32 5, i32 5>
+ %vshrn_low = trunc <4 x i32> %vshrn_low_shift to <4 x i16>
+ %vshrn_high_shift = lshr <4 x i32> %b0, <i32 5, i32 5, i32 5, i32 5>
+ %vshrn_high = trunc <4 x i32> %vshrn_high_shift to <4 x i16>
+; CHECK: shrn.4h v0, v0, #5
+; CHECK-NEXT: shrn2.8h v0, v1, #5
+; CHECK-NEXT: ret
+ %1 = bitcast <4 x i16> %vshrn_low to <1 x i64>
+ %2 = bitcast <4 x i16> %vshrn_high to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @foov4i32(<2 x i64> %a0, <2 x i64> %b0) nounwind readnone ssp {
+; CHECK-LABEL: foov4i32:
+ %vshrn_low_shift = lshr <2 x i64> %a0, <i64 5, i64 5>
+ %vshrn_low = trunc <2 x i64> %vshrn_low_shift to <2 x i32>
+ %vshrn_high_shift = lshr <2 x i64> %b0, <i64 5, i64 5>
+ %vshrn_high = trunc <2 x i64> %vshrn_high_shift to <2 x i32>
+; CHECK: shrn.2s v0, v0, #5
+; CHECK-NEXT: shrn2.4s v0, v1, #5
+; CHECK-NEXT: ret
+ %1 = bitcast <2 x i32> %vshrn_low to <1 x i64>
+ %2 = bitcast <2 x i32> %vshrn_high to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+define <8 x i16> @bar(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1) nounwind readnone ssp {
+; CHECK-LABEL: bar:
+ %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind
+ %vaddhn2.i10 = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind
+; CHECK: addhn.4h v0, v0, v1
+; CHECK-NEXT: addhn2.8h v0, v2, v3
+; CHECK-NEXT: ret
+ %1 = bitcast <4 x i16> %vaddhn2.i to <1 x i64>
+ %2 = bitcast <4 x i16> %vaddhn2.i10 to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @baz(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1) nounwind readnone ssp {
+; CHECK-LABEL: baz:
+ %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind
+ %vshrn_high_shift = ashr <4 x i32> %b0, <i32 5, i32 5, i32 5, i32 5>
+ %vshrn_high = trunc <4 x i32> %vshrn_high_shift to <4 x i16>
+; CHECK: addhn.4h v0, v0, v1
+; CHECK-NEXT: shrn2.8h v0, v2, #5
+; CHECK-NEXT: ret
+ %1 = bitcast <4 x i16> %vaddhn2.i to <1 x i64>
+ %2 = bitcast <4 x i16> %vshrn_high to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @raddhn(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1) nounwind readnone ssp {
+; CHECK-LABEL: raddhn:
+entry:
+; CHECK: raddhn.4h v0, v0, v1
+; CHECK-NEXT: raddhn2.8h v0, v2, v3
+; CHECK-NEXT: ret
+ %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind
+ %vraddhn2.i10 = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind
+ %0 = bitcast <4 x i16> %vraddhn2.i to <1 x i64>
+ %1 = bitcast <4 x i16> %vraddhn2.i10 to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <8 x i16> @vrshrn(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %b0, <8 x i16> %b1) nounwind readnone ssp {
+; CHECK-LABEL: vrshrn:
+; CHECK: rshrn.8b v0, v0, #5
+; CHECK-NEXT: rshrn2.16b v0, v2, #6
+; CHECK-NEXT: ret
+ %vrshrn_n1 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %a0, i32 5)
+ %vrshrn_n4 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %b0, i32 6)
+ %1 = bitcast <8 x i8> %vrshrn_n1 to <1 x i64>
+ %2 = bitcast <8 x i8> %vrshrn_n4 to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @vrsubhn(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %b0, <8 x i16> %b1) nounwind readnone ssp {
+; CHECK-LABEL: vrsubhn:
+; CHECK: rsubhn.8b v0, v0, v1
+; CHECK: rsubhn2.16b v0, v2, v3
+; CHECK-NEXT: ret
+ %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a0, <8 x i16> %a1) nounwind
+ %vrsubhn2.i10 = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %b0, <8 x i16> %b1) nounwind
+ %1 = bitcast <8 x i8> %vrsubhn2.i to <1 x i64>
+ %2 = bitcast <8 x i8> %vrsubhn2.i10 to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @noOpt1(<2 x i32> %a0, <2 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1) nounwind readnone ssp {
+; CHECK-LABEL: noOpt1:
+ %vqsub2.i = tail call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %a0, <2 x i32> %a1) nounwind
+ %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind
+; CHECK: sqsub.2s v0, v0, v1
+; CHECK-NEXT: addhn2.8h v0, v2, v3
+ %1 = bitcast <2 x i32> %vqsub2.i to <1 x i64>
+ %2 = bitcast <4 x i16> %vaddhn2.i to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.aarch64.neon.shrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.shrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.shrn.v2i32(<2 x i64>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+
diff --git a/test/CodeGen/AArch64/arm64-vector-ext.ll b/test/CodeGen/AArch64/arm64-vector-ext.ll
new file mode 100644
index 0000000..650ff1e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vector-ext.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+;CHECK: @func30
+;CHECK: ushll.4s v0, v0, #0
+;CHECK: movi.4s v1, #0x1
+;CHECK: and.16b v0, v0, v1
+;CHECK: str q0, [x0]
+;CHECK: ret
+
+%T0_30 = type <4 x i1>
+%T1_30 = type <4 x i32>
+define void @func30(%T0_30 %v0, %T1_30* %p1) {
+ %r = zext %T0_30 %v0 to %T1_30
+ store %T1_30 %r, %T1_30* %p1
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-vector-imm.ll b/test/CodeGen/AArch64/arm64-vector-imm.ll
new file mode 100644
index 0000000..9fb088b
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vector-imm.ll
@@ -0,0 +1,134 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @v_orrimm(<8 x i8>* %A) nounwind {
+; CHECK-LABEL: v_orrimm:
+; CHECK-NOT: mov
+; CHECK-NOT: mvn
+; CHECK: orr
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = or <8 x i8> %tmp1, <i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1>
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @v_orrimmQ(<16 x i8>* %A) nounwind {
+; CHECK: v_orrimmQ
+; CHECK-NOT: mov
+; CHECK-NOT: mvn
+; CHECK: orr
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = or <16 x i8> %tmp1, <i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 0, i8 1>
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @v_bicimm(<8 x i8>* %A) nounwind {
+; CHECK-LABEL: v_bicimm:
+; CHECK-NOT: mov
+; CHECK-NOT: mvn
+; CHECK: bic
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = and <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0 >
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @v_bicimmQ(<16 x i8>* %A) nounwind {
+; CHECK-LABEL: v_bicimmQ:
+; CHECK-NOT: mov
+; CHECK-NOT: mvn
+; CHECK: bic
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = and <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0 >
+ ret <16 x i8> %tmp3
+}
+
+define <2 x double> @foo(<2 x double> %bar) nounwind {
+; CHECK: foo
+; CHECK: fmov.2d v1, #1.0000000
+ %add = fadd <2 x double> %bar, <double 1.0, double 1.0>
+ ret <2 x double> %add
+}
+
+define <4 x i32> @movi_4s_imm_t1() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_4s_imm_t1:
+; CHECK: movi.4s v0, #0x4b
+ ret <4 x i32> <i32 75, i32 75, i32 75, i32 75>
+}
+
+define <4 x i32> @movi_4s_imm_t2() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_4s_imm_t2:
+; CHECK: movi.4s v0, #0x4b, lsl #8
+ ret <4 x i32> <i32 19200, i32 19200, i32 19200, i32 19200>
+}
+
+define <4 x i32> @movi_4s_imm_t3() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_4s_imm_t3:
+; CHECK: movi.4s v0, #0x4b, lsl #16
+ ret <4 x i32> <i32 4915200, i32 4915200, i32 4915200, i32 4915200>
+}
+
+define <4 x i32> @movi_4s_imm_t4() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_4s_imm_t4:
+; CHECK: movi.4s v0, #0x4b, lsl #24
+ ret <4 x i32> <i32 1258291200, i32 1258291200, i32 1258291200, i32 1258291200>
+}
+
+define <8 x i16> @movi_8h_imm_t5() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_8h_imm_t5:
+; CHECK: movi.8h v0, #0x4b
+ ret <8 x i16> <i16 75, i16 75, i16 75, i16 75, i16 75, i16 75, i16 75, i16 75>
+}
+
+; rdar://11989841
+define <8 x i16> @movi_8h_imm_t6() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_8h_imm_t6:
+; CHECK: movi.8h v0, #0x4b, lsl #8
+ ret <8 x i16> <i16 19200, i16 19200, i16 19200, i16 19200, i16 19200, i16 19200, i16 19200, i16 19200>
+}
+
+define <4 x i32> @movi_4s_imm_t7() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_4s_imm_t7:
+; CHECK: movi.4s v0, #0x4b, msl #8
+ ret <4 x i32> <i32 19455, i32 19455, i32 19455, i32 19455>
+}
+
+define <4 x i32> @movi_4s_imm_t8() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_4s_imm_t8:
+; CHECK: movi.4s v0, #0x4b, msl #16
+ ret <4 x i32> <i32 4980735, i32 4980735, i32 4980735, i32 4980735>
+}
+
+define <16 x i8> @movi_16b_imm_t9() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_16b_imm_t9:
+; CHECK: movi.16b v0, #0x4b
+ ret <16 x i8> <i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75,
+ i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75>
+}
+
+define <2 x i64> @movi_2d_imm_t10() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_2d_imm_t10:
+; CHECK: movi.2d v0, #0xff00ff00ff00ff
+ ret <2 x i64> <i64 71777214294589695, i64 71777214294589695>
+}
+
+define <4 x i32> @movi_4s_imm_t11() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_4s_imm_t11:
+; CHECK: fmov.4s v0, #-0.32812500
+ ret <4 x i32> <i32 3198681088, i32 3198681088, i32 3198681088, i32 3198681088>
+}
+
+define <2 x i64> @movi_2d_imm_t12() nounwind readnone ssp {
+entry:
+; CHECK-LABEL: movi_2d_imm_t12:
+; CHECK: fmov.2d v0, #-0.17187500
+ ret <2 x i64> <i64 13818732506632945664, i64 13818732506632945664>
+}
diff --git a/test/CodeGen/AArch64/arm64-vector-insertion.ll b/test/CodeGen/AArch64/arm64-vector-insertion.ll
new file mode 100644
index 0000000..8fbff71
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vector-insertion.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=arm64 -mcpu=generic -aarch64-neon-syntax=apple < %s | FileCheck %s
+
+define void @test0f(float* nocapture %x, float %a) #0 {
+entry:
+ %0 = insertelement <4 x float> <float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %a, i32 0
+ %1 = bitcast float* %x to <4 x float>*
+ store <4 x float> %0, <4 x float>* %1, align 16
+ ret void
+
+ ; CHECK-LABEL: test0f
+ ; CHECK: movi.2d v[[TEMP:[0-9]+]], #0000000000000000
+ ; CHECK: ins.s v[[TEMP]][0], v{{[0-9]+}}[0]
+ ; CHECK: str q[[TEMP]], [x0]
+ ; CHECK: ret
+
+
+}
+
+
+define void @test1f(float* nocapture %x, float %a) #0 {
+entry:
+ %0 = insertelement <4 x float> <float undef, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, float %a, i32 0
+ %1 = bitcast float* %x to <4 x float>*
+ store <4 x float> %0, <4 x float>* %1, align 16
+ ret void
+
+ ; CHECK-LABEL: test1f
+ ; CHECK: fmov s[[TEMP:[0-9]+]], #1.0000000
+ ; CHECK: dup.4s v[[TEMP2:[0-9]+]], v[[TEMP]][0]
+ ; CHECK: ins.s v[[TEMP2]][0], v0[0]
+ ; CHECK: str q[[TEMP2]], [x0]
+ ; CHECK: ret
+}
diff --git a/test/CodeGen/AArch64/arm64-vector-ldst.ll b/test/CodeGen/AArch64/arm64-vector-ldst.ll
new file mode 100644
index 0000000..c001915
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vector-ldst.ll
@@ -0,0 +1,601 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+
+; rdar://9428579
+
+%type1 = type { <16 x i8> }
+%type2 = type { <8 x i8> }
+%type3 = type { <4 x i16> }
+
+
+define hidden fastcc void @t1(%type1** %argtable) nounwind {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: ldr x[[REG:[0-9]+]], [x0]
+; CHECK: str q0, [x[[REG]]]
+ %tmp1 = load %type1** %argtable, align 8
+ %tmp2 = getelementptr inbounds %type1* %tmp1, i64 0, i32 0
+ store <16 x i8> zeroinitializer, <16 x i8>* %tmp2, align 16
+ ret void
+}
+
+define hidden fastcc void @t2(%type2** %argtable) nounwind {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: ldr x[[REG:[0-9]+]], [x0]
+; CHECK: str d0, [x[[REG]]]
+ %tmp1 = load %type2** %argtable, align 8
+ %tmp2 = getelementptr inbounds %type2* %tmp1, i64 0, i32 0
+ store <8 x i8> zeroinitializer, <8 x i8>* %tmp2, align 8
+ ret void
+}
+
+; add a bunch of tests for rdar://11246289
+
+@globalArray64x2 = common global <2 x i64>* null, align 8
+@globalArray32x4 = common global <4 x i32>* null, align 8
+@globalArray16x8 = common global <8 x i16>* null, align 8
+@globalArray8x16 = common global <16 x i8>* null, align 8
+@globalArray64x1 = common global <1 x i64>* null, align 8
+@globalArray32x2 = common global <2 x i32>* null, align 8
+@globalArray16x4 = common global <4 x i16>* null, align 8
+@globalArray8x8 = common global <8 x i8>* null, align 8
+@floatglobalArray64x2 = common global <2 x double>* null, align 8
+@floatglobalArray32x4 = common global <4 x float>* null, align 8
+@floatglobalArray64x1 = common global <1 x double>* null, align 8
+@floatglobalArray32x2 = common global <2 x float>* null, align 8
+
+define void @fct1_64x2(<2 x i64>* nocapture %array, i64 %offset) nounwind ssp {
+entry:
+; CHECK-LABEL: fct1_64x2:
+; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #4
+; CHECK: ldr [[DEST:q[0-9]+]], [x0, [[SHIFTEDOFFSET]]]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]]
+ %arrayidx = getelementptr inbounds <2 x i64>* %array, i64 %offset
+ %tmp = load <2 x i64>* %arrayidx, align 16
+ %tmp1 = load <2 x i64>** @globalArray64x2, align 8
+ %arrayidx1 = getelementptr inbounds <2 x i64>* %tmp1, i64 %offset
+ store <2 x i64> %tmp, <2 x i64>* %arrayidx1, align 16
+ ret void
+}
+
+define void @fct2_64x2(<2 x i64>* nocapture %array) nounwind ssp {
+entry:
+; CHECK-LABEL: fct2_64x2:
+; CHECK: ldr [[DEST:q[0-9]+]], [x0, #48]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], #80]
+ %arrayidx = getelementptr inbounds <2 x i64>* %array, i64 3
+ %tmp = load <2 x i64>* %arrayidx, align 16
+ %tmp1 = load <2 x i64>** @globalArray64x2, align 8
+ %arrayidx1 = getelementptr inbounds <2 x i64>* %tmp1, i64 5
+ store <2 x i64> %tmp, <2 x i64>* %arrayidx1, align 16
+ ret void
+}
+
+define void @fct1_32x4(<4 x i32>* nocapture %array, i64 %offset) nounwind ssp {
+entry:
+; CHECK-LABEL: fct1_32x4:
+; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #4
+; CHECK: ldr [[DEST:q[0-9]+]], [x0, [[SHIFTEDOFFSET]]]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]]
+ %arrayidx = getelementptr inbounds <4 x i32>* %array, i64 %offset
+ %tmp = load <4 x i32>* %arrayidx, align 16
+ %tmp1 = load <4 x i32>** @globalArray32x4, align 8
+ %arrayidx1 = getelementptr inbounds <4 x i32>* %tmp1, i64 %offset
+ store <4 x i32> %tmp, <4 x i32>* %arrayidx1, align 16
+ ret void
+}
+
+define void @fct2_32x4(<4 x i32>* nocapture %array) nounwind ssp {
+entry:
+; CHECK-LABEL: fct2_32x4:
+; CHECK: ldr [[DEST:q[0-9]+]], [x0, #48]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], #80]
+ %arrayidx = getelementptr inbounds <4 x i32>* %array, i64 3
+ %tmp = load <4 x i32>* %arrayidx, align 16
+ %tmp1 = load <4 x i32>** @globalArray32x4, align 8
+ %arrayidx1 = getelementptr inbounds <4 x i32>* %tmp1, i64 5
+ store <4 x i32> %tmp, <4 x i32>* %arrayidx1, align 16
+ ret void
+}
+
+define void @fct1_16x8(<8 x i16>* nocapture %array, i64 %offset) nounwind ssp {
+entry:
+; CHECK-LABEL: fct1_16x8:
+; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #4
+; CHECK: ldr [[DEST:q[0-9]+]], [x0, [[SHIFTEDOFFSET]]]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]]
+ %arrayidx = getelementptr inbounds <8 x i16>* %array, i64 %offset
+ %tmp = load <8 x i16>* %arrayidx, align 16
+ %tmp1 = load <8 x i16>** @globalArray16x8, align 8
+ %arrayidx1 = getelementptr inbounds <8 x i16>* %tmp1, i64 %offset
+ store <8 x i16> %tmp, <8 x i16>* %arrayidx1, align 16
+ ret void
+}
+
+define void @fct2_16x8(<8 x i16>* nocapture %array) nounwind ssp {
+entry:
+; CHECK-LABEL: fct2_16x8:
+; CHECK: ldr [[DEST:q[0-9]+]], [x0, #48]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], #80]
+ %arrayidx = getelementptr inbounds <8 x i16>* %array, i64 3
+ %tmp = load <8 x i16>* %arrayidx, align 16
+ %tmp1 = load <8 x i16>** @globalArray16x8, align 8
+ %arrayidx1 = getelementptr inbounds <8 x i16>* %tmp1, i64 5
+ store <8 x i16> %tmp, <8 x i16>* %arrayidx1, align 16
+ ret void
+}
+
+define void @fct1_8x16(<16 x i8>* nocapture %array, i64 %offset) nounwind ssp {
+entry:
+; CHECK-LABEL: fct1_8x16:
+; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #4
+; CHECK: ldr [[DEST:q[0-9]+]], [x0, [[SHIFTEDOFFSET]]]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]]
+ %arrayidx = getelementptr inbounds <16 x i8>* %array, i64 %offset
+ %tmp = load <16 x i8>* %arrayidx, align 16
+ %tmp1 = load <16 x i8>** @globalArray8x16, align 8
+ %arrayidx1 = getelementptr inbounds <16 x i8>* %tmp1, i64 %offset
+ store <16 x i8> %tmp, <16 x i8>* %arrayidx1, align 16
+ ret void
+}
+
+define void @fct2_8x16(<16 x i8>* nocapture %array) nounwind ssp {
+entry:
+; CHECK-LABEL: fct2_8x16:
+; CHECK: ldr [[DEST:q[0-9]+]], [x0, #48]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], #80]
+ %arrayidx = getelementptr inbounds <16 x i8>* %array, i64 3
+ %tmp = load <16 x i8>* %arrayidx, align 16
+ %tmp1 = load <16 x i8>** @globalArray8x16, align 8
+ %arrayidx1 = getelementptr inbounds <16 x i8>* %tmp1, i64 5
+ store <16 x i8> %tmp, <16 x i8>* %arrayidx1, align 16
+ ret void
+}
+
+define void @fct1_64x1(<1 x i64>* nocapture %array, i64 %offset) nounwind ssp {
+entry:
+; CHECK-LABEL: fct1_64x1:
+; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #3
+; CHECK: ldr [[DEST:d[0-9]+]], [x0, [[SHIFTEDOFFSET]]]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]]
+ %arrayidx = getelementptr inbounds <1 x i64>* %array, i64 %offset
+ %tmp = load <1 x i64>* %arrayidx, align 8
+ %tmp1 = load <1 x i64>** @globalArray64x1, align 8
+ %arrayidx1 = getelementptr inbounds <1 x i64>* %tmp1, i64 %offset
+ store <1 x i64> %tmp, <1 x i64>* %arrayidx1, align 8
+ ret void
+}
+
+define void @fct2_64x1(<1 x i64>* nocapture %array) nounwind ssp {
+entry:
+; CHECK-LABEL: fct2_64x1:
+; CHECK: ldr [[DEST:d[0-9]+]], [x0, #24]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], #40]
+ %arrayidx = getelementptr inbounds <1 x i64>* %array, i64 3
+ %tmp = load <1 x i64>* %arrayidx, align 8
+ %tmp1 = load <1 x i64>** @globalArray64x1, align 8
+ %arrayidx1 = getelementptr inbounds <1 x i64>* %tmp1, i64 5
+ store <1 x i64> %tmp, <1 x i64>* %arrayidx1, align 8
+ ret void
+}
+
+define void @fct1_32x2(<2 x i32>* nocapture %array, i64 %offset) nounwind ssp {
+entry:
+; CHECK-LABEL: fct1_32x2:
+; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #3
+; CHECK: ldr [[DEST:d[0-9]+]], [x0, [[SHIFTEDOFFSET]]]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]]
+ %arrayidx = getelementptr inbounds <2 x i32>* %array, i64 %offset
+ %tmp = load <2 x i32>* %arrayidx, align 8
+ %tmp1 = load <2 x i32>** @globalArray32x2, align 8
+ %arrayidx1 = getelementptr inbounds <2 x i32>* %tmp1, i64 %offset
+ store <2 x i32> %tmp, <2 x i32>* %arrayidx1, align 8
+ ret void
+}
+
+define void @fct2_32x2(<2 x i32>* nocapture %array) nounwind ssp {
+entry:
+; CHECK-LABEL: fct2_32x2:
+; CHECK: ldr [[DEST:d[0-9]+]], [x0, #24]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], #40]
+ %arrayidx = getelementptr inbounds <2 x i32>* %array, i64 3
+ %tmp = load <2 x i32>* %arrayidx, align 8
+ %tmp1 = load <2 x i32>** @globalArray32x2, align 8
+ %arrayidx1 = getelementptr inbounds <2 x i32>* %tmp1, i64 5
+ store <2 x i32> %tmp, <2 x i32>* %arrayidx1, align 8
+ ret void
+}
+
+define void @fct1_16x4(<4 x i16>* nocapture %array, i64 %offset) nounwind ssp {
+entry:
+; CHECK-LABEL: fct1_16x4:
+; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #3
+; CHECK: ldr [[DEST:d[0-9]+]], [x0, [[SHIFTEDOFFSET]]]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]]
+ %arrayidx = getelementptr inbounds <4 x i16>* %array, i64 %offset
+ %tmp = load <4 x i16>* %arrayidx, align 8
+ %tmp1 = load <4 x i16>** @globalArray16x4, align 8
+ %arrayidx1 = getelementptr inbounds <4 x i16>* %tmp1, i64 %offset
+ store <4 x i16> %tmp, <4 x i16>* %arrayidx1, align 8
+ ret void
+}
+
+define void @fct2_16x4(<4 x i16>* nocapture %array) nounwind ssp {
+entry:
+; CHECK-LABEL: fct2_16x4:
+; CHECK: ldr [[DEST:d[0-9]+]], [x0, #24]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], #40]
+ %arrayidx = getelementptr inbounds <4 x i16>* %array, i64 3
+ %tmp = load <4 x i16>* %arrayidx, align 8
+ %tmp1 = load <4 x i16>** @globalArray16x4, align 8
+ %arrayidx1 = getelementptr inbounds <4 x i16>* %tmp1, i64 5
+ store <4 x i16> %tmp, <4 x i16>* %arrayidx1, align 8
+ ret void
+}
+
+define void @fct1_8x8(<8 x i8>* nocapture %array, i64 %offset) nounwind ssp {
+entry:
+; CHECK-LABEL: fct1_8x8:
+; CHECK: lsl [[SHIFTEDOFFSET:x[0-9]+]], x1, #3
+; CHECK: ldr [[DEST:d[0-9]+]], [x0, [[SHIFTEDOFFSET]]]
+; CHECK: ldr [[BASE:x[0-9]+]],
+; CHECK: str [[DEST]], {{\[}}[[BASE]], [[SHIFTEDOFFSET]]]
+ %arrayidx = getelementptr inbounds <8 x i8>* %array, i64 %offset
+ %tmp = load <8 x i8>* %arrayidx, align 8
+ %tmp1 = load <8 x i8>** @globalArray8x8, align 8
+ %arrayidx1 = getelementptr inbounds <8 x i8>* %tmp1, i64 %offset
+ store <8 x i8> %tmp, <8 x i8>* %arrayidx1, align 8
+ ret void
+}
+
+; Add a bunch of tests for rdar://13258794: Match LDUR/STUR for D and Q
+; registers for unscaled vector accesses
+@str = global [63 x i8] c"Test case for rdar://13258794: LDUR/STUR for D and Q registers\00", align 1
+
+define <1 x i64> @fct0() nounwind readonly ssp {
+entry:
+; CHECK-LABEL: fct0:
+; CHECK: ldur {{d[0-9]+}}, [{{x[0-9]+}}, #3]
+ %0 = load <1 x i64>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <1 x i64>*), align 8
+ ret <1 x i64> %0
+}
+
+define <2 x i32> @fct1() nounwind readonly ssp {
+entry:
+; CHECK-LABEL: fct1:
+; CHECK: ldur {{d[0-9]+}}, [{{x[0-9]+}}, #3]
+ %0 = load <2 x i32>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <2 x i32>*), align 8
+ ret <2 x i32> %0
+}
+
+define <4 x i16> @fct2() nounwind readonly ssp {
+entry:
+; CHECK-LABEL: fct2:
+; CHECK: ldur {{d[0-9]+}}, [{{x[0-9]+}}, #3]
+ %0 = load <4 x i16>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <4 x i16>*), align 8
+ ret <4 x i16> %0
+}
+
+define <8 x i8> @fct3() nounwind readonly ssp {
+entry:
+; CHECK-LABEL: fct3:
+; CHECK: ldur {{d[0-9]+}}, [{{x[0-9]+}}, #3]
+ %0 = load <8 x i8>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <8 x i8>*), align 8
+ ret <8 x i8> %0
+}
+
+define <2 x i64> @fct4() nounwind readonly ssp {
+entry:
+; CHECK-LABEL: fct4:
+; CHECK: ldur {{q[0-9]+}}, [{{x[0-9]+}}, #3]
+ %0 = load <2 x i64>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <2 x i64>*), align 16
+ ret <2 x i64> %0
+}
+
+define <4 x i32> @fct5() nounwind readonly ssp {
+entry:
+; CHECK-LABEL: fct5:
+; CHECK: ldur {{q[0-9]+}}, [{{x[0-9]+}}, #3]
+ %0 = load <4 x i32>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <4 x i32>*), align 16
+ ret <4 x i32> %0
+}
+
+define <8 x i16> @fct6() nounwind readonly ssp {
+entry:
+; CHECK-LABEL: fct6:
+; CHECK: ldur {{q[0-9]+}}, [{{x[0-9]+}}, #3]
+ %0 = load <8 x i16>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <8 x i16>*), align 16
+ ret <8 x i16> %0
+}
+
+define <16 x i8> @fct7() nounwind readonly ssp {
+entry:
+; CHECK-LABEL: fct7:
+; CHECK: ldur {{q[0-9]+}}, [{{x[0-9]+}}, #3]
+ %0 = load <16 x i8>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <16 x i8>*), align 16
+ ret <16 x i8> %0
+}
+
+define void @fct8() nounwind ssp {
+entry:
+; CHECK-LABEL: fct8:
+; CHECK: ldur [[DESTREG:d[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3]
+; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4]
+ %0 = load <1 x i64>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <1 x i64>*), align 8
+ store <1 x i64> %0, <1 x i64>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 4) to <1 x i64>*), align 8
+ ret void
+}
+
+define void @fct9() nounwind ssp {
+entry:
+; CHECK-LABEL: fct9:
+; CHECK: ldur [[DESTREG:d[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3]
+; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4]
+ %0 = load <2 x i32>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <2 x i32>*), align 8
+ store <2 x i32> %0, <2 x i32>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 4) to <2 x i32>*), align 8
+ ret void
+}
+
+define void @fct10() nounwind ssp {
+entry:
+; CHECK-LABEL: fct10:
+; CHECK: ldur [[DESTREG:d[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3]
+; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4]
+ %0 = load <4 x i16>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <4 x i16>*), align 8
+ store <4 x i16> %0, <4 x i16>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 4) to <4 x i16>*), align 8
+ ret void
+}
+
+define void @fct11() nounwind ssp {
+entry:
+; CHECK-LABEL: fct11:
+; CHECK: ldur [[DESTREG:d[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3]
+; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4]
+ %0 = load <8 x i8>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <8 x i8>*), align 8
+ store <8 x i8> %0, <8 x i8>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 4) to <8 x i8>*), align 8
+ ret void
+}
+
+define void @fct12() nounwind ssp {
+entry:
+; CHECK-LABEL: fct12:
+; CHECK: ldur [[DESTREG:q[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3]
+; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4]
+ %0 = load <2 x i64>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <2 x i64>*), align 16
+ store <2 x i64> %0, <2 x i64>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 4) to <2 x i64>*), align 16
+ ret void
+}
+
+define void @fct13() nounwind ssp {
+entry:
+; CHECK-LABEL: fct13:
+; CHECK: ldur [[DESTREG:q[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3]
+; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4]
+ %0 = load <4 x i32>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <4 x i32>*), align 16
+ store <4 x i32> %0, <4 x i32>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 4) to <4 x i32>*), align 16
+ ret void
+}
+
+define void @fct14() nounwind ssp {
+entry:
+; CHECK-LABEL: fct14:
+; CHECK: ldur [[DESTREG:q[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3]
+; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4]
+ %0 = load <8 x i16>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <8 x i16>*), align 16
+ store <8 x i16> %0, <8 x i16>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 4) to <8 x i16>*), align 16
+ ret void
+}
+
+define void @fct15() nounwind ssp {
+entry:
+; CHECK-LABEL: fct15:
+; CHECK: ldur [[DESTREG:q[0-9]+]], {{\[}}[[BASEREG:x[0-9]+]], #3]
+; CHECK: stur [[DESTREG]], {{\[}}[[BASEREG]], #4]
+ %0 = load <16 x i8>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 3) to <16 x i8>*), align 16
+ store <16 x i8> %0, <16 x i8>* bitcast (i8* getelementptr inbounds ([63 x i8]* @str, i64 0, i64 4) to <16 x i8>*), align 16
+ ret void
+}
+
+; Check the building of a vector from a single loaded value.
+; Part of <rdar://problem/14170854>
+;
+; Single loads with immediate offset.
+define <8 x i8> @fct16(i8* nocapture %sp0) {
+; CHECK-LABEL: fct16:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: mul.8b v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 1
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %vec = insertelement <8 x i8> undef, i8 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <8 x i8> %vec, %vec
+ ret <8 x i8> %vmull.i
+}
+
+define <16 x i8> @fct17(i8* nocapture %sp0) {
+; CHECK-LABEL: fct17:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, #1]
+; CHECK-NEXT: mul.16b v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 1
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %vec = insertelement <16 x i8> undef, i8 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <16 x i8> %vec, %vec
+ ret <16 x i8> %vmull.i
+}
+
+define <4 x i16> @fct18(i16* nocapture %sp0) {
+; CHECK-LABEL: fct18:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, #2]
+; CHECK-NEXT: mul.4h v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 1
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %vec = insertelement <4 x i16> undef, i16 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <4 x i16> %vec, %vec
+ ret <4 x i16> %vmull.i
+}
+
+define <8 x i16> @fct19(i16* nocapture %sp0) {
+; CHECK-LABEL: fct19:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, #2]
+; CHECK-NEXT: mul.8h v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 1
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %vec = insertelement <8 x i16> undef, i16 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <8 x i16> %vec, %vec
+ ret <8 x i16> %vmull.i
+}
+
+define <2 x i32> @fct20(i32* nocapture %sp0) {
+; CHECK-LABEL: fct20:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, #4]
+; CHECK-NEXT: mul.2s v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 1
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %vec = insertelement <2 x i32> undef, i32 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <2 x i32> %vec, %vec
+ ret <2 x i32> %vmull.i
+}
+
+define <4 x i32> @fct21(i32* nocapture %sp0) {
+; CHECK-LABEL: fct21:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, #4]
+; CHECK-NEXT: mul.4s v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 1
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %vec = insertelement <4 x i32> undef, i32 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <4 x i32> %vec, %vec
+ ret <4 x i32> %vmull.i
+}
+
+define <1 x i64> @fct22(i64* nocapture %sp0) {
+; CHECK-LABEL: fct22:
+; CHECK: ldr d0, [x0, #8]
+entry:
+ %addr = getelementptr i64* %sp0, i64 1
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %vec = insertelement <1 x i64> undef, i64 %pix_sp0.0.copyload, i32 0
+ ret <1 x i64> %vec
+}
+
+define <2 x i64> @fct23(i64* nocapture %sp0) {
+; CHECK-LABEL: fct23:
+; CHECK: ldr d[[REGNUM:[0-9]+]], [x0, #8]
+entry:
+ %addr = getelementptr i64* %sp0, i64 1
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %vec = insertelement <2 x i64> undef, i64 %pix_sp0.0.copyload, i32 0
+ ret <2 x i64> %vec
+}
+
+;
+; Single loads with register offset.
+define <8 x i8> @fct24(i8* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct24:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, x1]
+; CHECK-NEXT: mul.8b v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %vec = insertelement <8 x i8> undef, i8 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <8 x i8> %vec, %vec
+ ret <8 x i8> %vmull.i
+}
+
+define <16 x i8> @fct25(i8* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct25:
+; CHECK: ldr b[[REGNUM:[0-9]+]], [x0, x1]
+; CHECK-NEXT: mul.16b v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i8* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i8* %addr, align 1
+ %vec = insertelement <16 x i8> undef, i8 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <16 x i8> %vec, %vec
+ ret <16 x i8> %vmull.i
+}
+
+define <4 x i16> @fct26(i16* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct26:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, x1, lsl #1]
+; CHECK-NEXT: mul.4h v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %vec = insertelement <4 x i16> undef, i16 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <4 x i16> %vec, %vec
+ ret <4 x i16> %vmull.i
+}
+
+define <8 x i16> @fct27(i16* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct27:
+; CHECK: ldr h[[REGNUM:[0-9]+]], [x0, x1, lsl #1]
+; CHECK-NEXT: mul.8h v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i16* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i16* %addr, align 1
+ %vec = insertelement <8 x i16> undef, i16 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <8 x i16> %vec, %vec
+ ret <8 x i16> %vmull.i
+}
+
+define <2 x i32> @fct28(i32* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct28:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, x1, lsl #2]
+; CHECK-NEXT: mul.2s v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %vec = insertelement <2 x i32> undef, i32 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <2 x i32> %vec, %vec
+ ret <2 x i32> %vmull.i
+}
+
+define <4 x i32> @fct29(i32* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct29:
+; CHECK: ldr s[[REGNUM:[0-9]+]], [x0, x1, lsl #2]
+; CHECK-NEXT: mul.4s v0, v[[REGNUM]], v[[REGNUM]]
+entry:
+ %addr = getelementptr i32* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i32* %addr, align 1
+ %vec = insertelement <4 x i32> undef, i32 %pix_sp0.0.copyload, i32 0
+ %vmull.i = mul <4 x i32> %vec, %vec
+ ret <4 x i32> %vmull.i
+}
+
+define <1 x i64> @fct30(i64* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct30:
+; CHECK: ldr d0, [x0, x1, lsl #3]
+entry:
+ %addr = getelementptr i64* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %vec = insertelement <1 x i64> undef, i64 %pix_sp0.0.copyload, i32 0
+ ret <1 x i64> %vec
+}
+
+define <2 x i64> @fct31(i64* nocapture %sp0, i64 %offset) {
+; CHECK-LABEL: fct31:
+; CHECK: ldr d0, [x0, x1, lsl #3]
+entry:
+ %addr = getelementptr i64* %sp0, i64 %offset
+ %pix_sp0.0.copyload = load i64* %addr, align 1
+ %vec = insertelement <2 x i64> undef, i64 %pix_sp0.0.copyload, i32 0
+ ret <2 x i64> %vec
+}
diff --git a/test/CodeGen/AArch64/arm64-vext.ll b/test/CodeGen/AArch64/arm64-vext.ll
new file mode 100644
index 0000000..2240dfd
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vext.ll
@@ -0,0 +1,464 @@
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
+
+define void @test_vext_s8() nounwind ssp {
+ ; CHECK-LABEL: test_vext_s8:
+ ; CHECK: {{ext.8.*#1}}
+ %xS8x8 = alloca <8 x i8>, align 8
+ %__a = alloca <8 x i8>, align 8
+ %__b = alloca <8 x i8>, align 8
+ %tmp = load <8 x i8>* %xS8x8, align 8
+ store <8 x i8> %tmp, <8 x i8>* %__a, align 8
+ %tmp1 = load <8 x i8>* %xS8x8, align 8
+ store <8 x i8> %tmp1, <8 x i8>* %__b, align 8
+ %tmp2 = load <8 x i8>* %__a, align 8
+ %tmp3 = load <8 x i8>* %__b, align 8
+ %vext = shufflevector <8 x i8> %tmp2, <8 x i8> %tmp3, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+ store <8 x i8> %vext, <8 x i8>* %xS8x8, align 8
+ ret void
+}
+
+define void @test_vext_u8() nounwind ssp {
+ ; CHECK-LABEL: test_vext_u8:
+ ; CHECK: {{ext.8.*#2}}
+ %xU8x8 = alloca <8 x i8>, align 8
+ %__a = alloca <8 x i8>, align 8
+ %__b = alloca <8 x i8>, align 8
+ %tmp = load <8 x i8>* %xU8x8, align 8
+ store <8 x i8> %tmp, <8 x i8>* %__a, align 8
+ %tmp1 = load <8 x i8>* %xU8x8, align 8
+ store <8 x i8> %tmp1, <8 x i8>* %__b, align 8
+ %tmp2 = load <8 x i8>* %__a, align 8
+ %tmp3 = load <8 x i8>* %__b, align 8
+ %vext = shufflevector <8 x i8> %tmp2, <8 x i8> %tmp3, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+ store <8 x i8> %vext, <8 x i8>* %xU8x8, align 8
+ ret void
+}
+
+define void @test_vext_p8() nounwind ssp {
+ ; CHECK-LABEL: test_vext_p8:
+ ; CHECK: {{ext.8.*#3}}
+ %xP8x8 = alloca <8 x i8>, align 8
+ %__a = alloca <8 x i8>, align 8
+ %__b = alloca <8 x i8>, align 8
+ %tmp = load <8 x i8>* %xP8x8, align 8
+ store <8 x i8> %tmp, <8 x i8>* %__a, align 8
+ %tmp1 = load <8 x i8>* %xP8x8, align 8
+ store <8 x i8> %tmp1, <8 x i8>* %__b, align 8
+ %tmp2 = load <8 x i8>* %__a, align 8
+ %tmp3 = load <8 x i8>* %__b, align 8
+ %vext = shufflevector <8 x i8> %tmp2, <8 x i8> %tmp3, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+ store <8 x i8> %vext, <8 x i8>* %xP8x8, align 8
+ ret void
+}
+
+define void @test_vext_s16() nounwind ssp {
+ ; CHECK-LABEL: test_vext_s16:
+ ; CHECK: {{ext.8.*#2}}
+ %xS16x4 = alloca <4 x i16>, align 8
+ %__a = alloca <4 x i16>, align 8
+ %__b = alloca <4 x i16>, align 8
+ %tmp = load <4 x i16>* %xS16x4, align 8
+ store <4 x i16> %tmp, <4 x i16>* %__a, align 8
+ %tmp1 = load <4 x i16>* %xS16x4, align 8
+ store <4 x i16> %tmp1, <4 x i16>* %__b, align 8
+ %tmp2 = load <4 x i16>* %__a, align 8
+ %tmp3 = bitcast <4 x i16> %tmp2 to <8 x i8>
+ %tmp4 = load <4 x i16>* %__b, align 8
+ %tmp5 = bitcast <4 x i16> %tmp4 to <8 x i8>
+ %tmp6 = bitcast <8 x i8> %tmp3 to <4 x i16>
+ %tmp7 = bitcast <8 x i8> %tmp5 to <4 x i16>
+ %vext = shufflevector <4 x i16> %tmp6, <4 x i16> %tmp7, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+ store <4 x i16> %vext, <4 x i16>* %xS16x4, align 8
+ ret void
+}
+
+define void @test_vext_u16() nounwind ssp {
+ ; CHECK-LABEL: test_vext_u16:
+ ; CHECK: {{ext.8.*#4}}
+ %xU16x4 = alloca <4 x i16>, align 8
+ %__a = alloca <4 x i16>, align 8
+ %__b = alloca <4 x i16>, align 8
+ %tmp = load <4 x i16>* %xU16x4, align 8
+ store <4 x i16> %tmp, <4 x i16>* %__a, align 8
+ %tmp1 = load <4 x i16>* %xU16x4, align 8
+ store <4 x i16> %tmp1, <4 x i16>* %__b, align 8
+ %tmp2 = load <4 x i16>* %__a, align 8
+ %tmp3 = bitcast <4 x i16> %tmp2 to <8 x i8>
+ %tmp4 = load <4 x i16>* %__b, align 8
+ %tmp5 = bitcast <4 x i16> %tmp4 to <8 x i8>
+ %tmp6 = bitcast <8 x i8> %tmp3 to <4 x i16>
+ %tmp7 = bitcast <8 x i8> %tmp5 to <4 x i16>
+ %vext = shufflevector <4 x i16> %tmp6, <4 x i16> %tmp7, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ store <4 x i16> %vext, <4 x i16>* %xU16x4, align 8
+ ret void
+}
+
+define void @test_vext_p16() nounwind ssp {
+ ; CHECK-LABEL: test_vext_p16:
+ ; CHECK: {{ext.8.*#6}}
+ %xP16x4 = alloca <4 x i16>, align 8
+ %__a = alloca <4 x i16>, align 8
+ %__b = alloca <4 x i16>, align 8
+ %tmp = load <4 x i16>* %xP16x4, align 8
+ store <4 x i16> %tmp, <4 x i16>* %__a, align 8
+ %tmp1 = load <4 x i16>* %xP16x4, align 8
+ store <4 x i16> %tmp1, <4 x i16>* %__b, align 8
+ %tmp2 = load <4 x i16>* %__a, align 8
+ %tmp3 = bitcast <4 x i16> %tmp2 to <8 x i8>
+ %tmp4 = load <4 x i16>* %__b, align 8
+ %tmp5 = bitcast <4 x i16> %tmp4 to <8 x i8>
+ %tmp6 = bitcast <8 x i8> %tmp3 to <4 x i16>
+ %tmp7 = bitcast <8 x i8> %tmp5 to <4 x i16>
+ %vext = shufflevector <4 x i16> %tmp6, <4 x i16> %tmp7, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+ store <4 x i16> %vext, <4 x i16>* %xP16x4, align 8
+ ret void
+}
+
+define void @test_vext_s32() nounwind ssp {
+ ; CHECK-LABEL: test_vext_s32:
+ ; CHECK: {{ext.8.*#4}}
+ %xS32x2 = alloca <2 x i32>, align 8
+ %__a = alloca <2 x i32>, align 8
+ %__b = alloca <2 x i32>, align 8
+ %tmp = load <2 x i32>* %xS32x2, align 8
+ store <2 x i32> %tmp, <2 x i32>* %__a, align 8
+ %tmp1 = load <2 x i32>* %xS32x2, align 8
+ store <2 x i32> %tmp1, <2 x i32>* %__b, align 8
+ %tmp2 = load <2 x i32>* %__a, align 8
+ %tmp3 = bitcast <2 x i32> %tmp2 to <8 x i8>
+ %tmp4 = load <2 x i32>* %__b, align 8
+ %tmp5 = bitcast <2 x i32> %tmp4 to <8 x i8>
+ %tmp6 = bitcast <8 x i8> %tmp3 to <2 x i32>
+ %tmp7 = bitcast <8 x i8> %tmp5 to <2 x i32>
+ %vext = shufflevector <2 x i32> %tmp6, <2 x i32> %tmp7, <2 x i32> <i32 1, i32 2>
+ store <2 x i32> %vext, <2 x i32>* %xS32x2, align 8
+ ret void
+}
+
+define void @test_vext_u32() nounwind ssp {
+ ; CHECK-LABEL: test_vext_u32:
+ ; CHECK: {{ext.8.*#4}}
+ %xU32x2 = alloca <2 x i32>, align 8
+ %__a = alloca <2 x i32>, align 8
+ %__b = alloca <2 x i32>, align 8
+ %tmp = load <2 x i32>* %xU32x2, align 8
+ store <2 x i32> %tmp, <2 x i32>* %__a, align 8
+ %tmp1 = load <2 x i32>* %xU32x2, align 8
+ store <2 x i32> %tmp1, <2 x i32>* %__b, align 8
+ %tmp2 = load <2 x i32>* %__a, align 8
+ %tmp3 = bitcast <2 x i32> %tmp2 to <8 x i8>
+ %tmp4 = load <2 x i32>* %__b, align 8
+ %tmp5 = bitcast <2 x i32> %tmp4 to <8 x i8>
+ %tmp6 = bitcast <8 x i8> %tmp3 to <2 x i32>
+ %tmp7 = bitcast <8 x i8> %tmp5 to <2 x i32>
+ %vext = shufflevector <2 x i32> %tmp6, <2 x i32> %tmp7, <2 x i32> <i32 1, i32 2>
+ store <2 x i32> %vext, <2 x i32>* %xU32x2, align 8
+ ret void
+}
+
+define void @test_vext_f32() nounwind ssp {
+ ; CHECK-LABEL: test_vext_f32:
+ ; CHECK: {{ext.8.*#4}}
+ %xF32x2 = alloca <2 x float>, align 8
+ %__a = alloca <2 x float>, align 8
+ %__b = alloca <2 x float>, align 8
+ %tmp = load <2 x float>* %xF32x2, align 8
+ store <2 x float> %tmp, <2 x float>* %__a, align 8
+ %tmp1 = load <2 x float>* %xF32x2, align 8
+ store <2 x float> %tmp1, <2 x float>* %__b, align 8
+ %tmp2 = load <2 x float>* %__a, align 8
+ %tmp3 = bitcast <2 x float> %tmp2 to <8 x i8>
+ %tmp4 = load <2 x float>* %__b, align 8
+ %tmp5 = bitcast <2 x float> %tmp4 to <8 x i8>
+ %tmp6 = bitcast <8 x i8> %tmp3 to <2 x float>
+ %tmp7 = bitcast <8 x i8> %tmp5 to <2 x float>
+ %vext = shufflevector <2 x float> %tmp6, <2 x float> %tmp7, <2 x i32> <i32 1, i32 2>
+ store <2 x float> %vext, <2 x float>* %xF32x2, align 8
+ ret void
+}
+
+define void @test_vext_s64() nounwind ssp {
+ ; CHECK-LABEL: test_vext_s64:
+ ; CHECK_FIXME: {{ext.8.*#1}}
+ ; this just turns into a load of the second element
+ %xS64x1 = alloca <1 x i64>, align 8
+ %__a = alloca <1 x i64>, align 8
+ %__b = alloca <1 x i64>, align 8
+ %tmp = load <1 x i64>* %xS64x1, align 8
+ store <1 x i64> %tmp, <1 x i64>* %__a, align 8
+ %tmp1 = load <1 x i64>* %xS64x1, align 8
+ store <1 x i64> %tmp1, <1 x i64>* %__b, align 8
+ %tmp2 = load <1 x i64>* %__a, align 8
+ %tmp3 = bitcast <1 x i64> %tmp2 to <8 x i8>
+ %tmp4 = load <1 x i64>* %__b, align 8
+ %tmp5 = bitcast <1 x i64> %tmp4 to <8 x i8>
+ %tmp6 = bitcast <8 x i8> %tmp3 to <1 x i64>
+ %tmp7 = bitcast <8 x i8> %tmp5 to <1 x i64>
+ %vext = shufflevector <1 x i64> %tmp6, <1 x i64> %tmp7, <1 x i32> <i32 1>
+ store <1 x i64> %vext, <1 x i64>* %xS64x1, align 8
+ ret void
+}
+
+define void @test_vext_u64() nounwind ssp {
+ ; CHECK-LABEL: test_vext_u64:
+ ; CHECK_FIXME: {{ext.8.*#1}}
+ ; this is turned into a simple load of the 2nd element
+ %xU64x1 = alloca <1 x i64>, align 8
+ %__a = alloca <1 x i64>, align 8
+ %__b = alloca <1 x i64>, align 8
+ %tmp = load <1 x i64>* %xU64x1, align 8
+ store <1 x i64> %tmp, <1 x i64>* %__a, align 8
+ %tmp1 = load <1 x i64>* %xU64x1, align 8
+ store <1 x i64> %tmp1, <1 x i64>* %__b, align 8
+ %tmp2 = load <1 x i64>* %__a, align 8
+ %tmp3 = bitcast <1 x i64> %tmp2 to <8 x i8>
+ %tmp4 = load <1 x i64>* %__b, align 8
+ %tmp5 = bitcast <1 x i64> %tmp4 to <8 x i8>
+ %tmp6 = bitcast <8 x i8> %tmp3 to <1 x i64>
+ %tmp7 = bitcast <8 x i8> %tmp5 to <1 x i64>
+ %vext = shufflevector <1 x i64> %tmp6, <1 x i64> %tmp7, <1 x i32> <i32 1>
+ store <1 x i64> %vext, <1 x i64>* %xU64x1, align 8
+ ret void
+}
+
+define void @test_vextq_s8() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_s8:
+ ; CHECK: {{ext.16.*#4}}
+ %xS8x16 = alloca <16 x i8>, align 16
+ %__a = alloca <16 x i8>, align 16
+ %__b = alloca <16 x i8>, align 16
+ %tmp = load <16 x i8>* %xS8x16, align 16
+ store <16 x i8> %tmp, <16 x i8>* %__a, align 16
+ %tmp1 = load <16 x i8>* %xS8x16, align 16
+ store <16 x i8> %tmp1, <16 x i8>* %__b, align 16
+ %tmp2 = load <16 x i8>* %__a, align 16
+ %tmp3 = load <16 x i8>* %__b, align 16
+ %vext = shufflevector <16 x i8> %tmp2, <16 x i8> %tmp3, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+ store <16 x i8> %vext, <16 x i8>* %xS8x16, align 16
+ ret void
+}
+
+define void @test_vextq_u8() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_u8:
+ ; CHECK: {{ext.16.*#5}}
+ %xU8x16 = alloca <16 x i8>, align 16
+ %__a = alloca <16 x i8>, align 16
+ %__b = alloca <16 x i8>, align 16
+ %tmp = load <16 x i8>* %xU8x16, align 16
+ store <16 x i8> %tmp, <16 x i8>* %__a, align 16
+ %tmp1 = load <16 x i8>* %xU8x16, align 16
+ store <16 x i8> %tmp1, <16 x i8>* %__b, align 16
+ %tmp2 = load <16 x i8>* %__a, align 16
+ %tmp3 = load <16 x i8>* %__b, align 16
+ %vext = shufflevector <16 x i8> %tmp2, <16 x i8> %tmp3, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+ store <16 x i8> %vext, <16 x i8>* %xU8x16, align 16
+ ret void
+}
+
+define void @test_vextq_p8() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_p8:
+ ; CHECK: {{ext.16.*#6}}
+ %xP8x16 = alloca <16 x i8>, align 16
+ %__a = alloca <16 x i8>, align 16
+ %__b = alloca <16 x i8>, align 16
+ %tmp = load <16 x i8>* %xP8x16, align 16
+ store <16 x i8> %tmp, <16 x i8>* %__a, align 16
+ %tmp1 = load <16 x i8>* %xP8x16, align 16
+ store <16 x i8> %tmp1, <16 x i8>* %__b, align 16
+ %tmp2 = load <16 x i8>* %__a, align 16
+ %tmp3 = load <16 x i8>* %__b, align 16
+ %vext = shufflevector <16 x i8> %tmp2, <16 x i8> %tmp3, <16 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21>
+ store <16 x i8> %vext, <16 x i8>* %xP8x16, align 16
+ ret void
+}
+
+define void @test_vextq_s16() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_s16:
+ ; CHECK: {{ext.16.*#14}}
+ %xS16x8 = alloca <8 x i16>, align 16
+ %__a = alloca <8 x i16>, align 16
+ %__b = alloca <8 x i16>, align 16
+ %tmp = load <8 x i16>* %xS16x8, align 16
+ store <8 x i16> %tmp, <8 x i16>* %__a, align 16
+ %tmp1 = load <8 x i16>* %xS16x8, align 16
+ store <8 x i16> %tmp1, <8 x i16>* %__b, align 16
+ %tmp2 = load <8 x i16>* %__a, align 16
+ %tmp3 = bitcast <8 x i16> %tmp2 to <16 x i8>
+ %tmp4 = load <8 x i16>* %__b, align 16
+ %tmp5 = bitcast <8 x i16> %tmp4 to <16 x i8>
+ %tmp6 = bitcast <16 x i8> %tmp3 to <8 x i16>
+ %tmp7 = bitcast <16 x i8> %tmp5 to <8 x i16>
+ %vext = shufflevector <8 x i16> %tmp6, <8 x i16> %tmp7, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+ store <8 x i16> %vext, <8 x i16>* %xS16x8, align 16
+ ret void
+}
+
+define void @test_vextq_u16() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_u16:
+ ; CHECK: {{ext.16.*#8}}
+ %xU16x8 = alloca <8 x i16>, align 16
+ %__a = alloca <8 x i16>, align 16
+ %__b = alloca <8 x i16>, align 16
+ %tmp = load <8 x i16>* %xU16x8, align 16
+ store <8 x i16> %tmp, <8 x i16>* %__a, align 16
+ %tmp1 = load <8 x i16>* %xU16x8, align 16
+ store <8 x i16> %tmp1, <8 x i16>* %__b, align 16
+ %tmp2 = load <8 x i16>* %__a, align 16
+ %tmp3 = bitcast <8 x i16> %tmp2 to <16 x i8>
+ %tmp4 = load <8 x i16>* %__b, align 16
+ %tmp5 = bitcast <8 x i16> %tmp4 to <16 x i8>
+ %tmp6 = bitcast <16 x i8> %tmp3 to <8 x i16>
+ %tmp7 = bitcast <16 x i8> %tmp5 to <8 x i16>
+ %vext = shufflevector <8 x i16> %tmp6, <8 x i16> %tmp7, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ store <8 x i16> %vext, <8 x i16>* %xU16x8, align 16
+ ret void
+}
+
+define void @test_vextq_p16() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_p16:
+ ; CHECK: {{ext.16.*#10}}
+ %xP16x8 = alloca <8 x i16>, align 16
+ %__a = alloca <8 x i16>, align 16
+ %__b = alloca <8 x i16>, align 16
+ %tmp = load <8 x i16>* %xP16x8, align 16
+ store <8 x i16> %tmp, <8 x i16>* %__a, align 16
+ %tmp1 = load <8 x i16>* %xP16x8, align 16
+ store <8 x i16> %tmp1, <8 x i16>* %__b, align 16
+ %tmp2 = load <8 x i16>* %__a, align 16
+ %tmp3 = bitcast <8 x i16> %tmp2 to <16 x i8>
+ %tmp4 = load <8 x i16>* %__b, align 16
+ %tmp5 = bitcast <8 x i16> %tmp4 to <16 x i8>
+ %tmp6 = bitcast <16 x i8> %tmp3 to <8 x i16>
+ %tmp7 = bitcast <16 x i8> %tmp5 to <8 x i16>
+ %vext = shufflevector <8 x i16> %tmp6, <8 x i16> %tmp7, <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
+ store <8 x i16> %vext, <8 x i16>* %xP16x8, align 16
+ ret void
+}
+
+define void @test_vextq_s32() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_s32:
+ ; CHECK: {{ext.16.*#4}}
+ %xS32x4 = alloca <4 x i32>, align 16
+ %__a = alloca <4 x i32>, align 16
+ %__b = alloca <4 x i32>, align 16
+ %tmp = load <4 x i32>* %xS32x4, align 16
+ store <4 x i32> %tmp, <4 x i32>* %__a, align 16
+ %tmp1 = load <4 x i32>* %xS32x4, align 16
+ store <4 x i32> %tmp1, <4 x i32>* %__b, align 16
+ %tmp2 = load <4 x i32>* %__a, align 16
+ %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
+ %tmp4 = load <4 x i32>* %__b, align 16
+ %tmp5 = bitcast <4 x i32> %tmp4 to <16 x i8>
+ %tmp6 = bitcast <16 x i8> %tmp3 to <4 x i32>
+ %tmp7 = bitcast <16 x i8> %tmp5 to <4 x i32>
+ %vext = shufflevector <4 x i32> %tmp6, <4 x i32> %tmp7, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+ store <4 x i32> %vext, <4 x i32>* %xS32x4, align 16
+ ret void
+}
+
+define void @test_vextq_u32() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_u32:
+ ; CHECK: {{ext.16.*#8}}
+ %xU32x4 = alloca <4 x i32>, align 16
+ %__a = alloca <4 x i32>, align 16
+ %__b = alloca <4 x i32>, align 16
+ %tmp = load <4 x i32>* %xU32x4, align 16
+ store <4 x i32> %tmp, <4 x i32>* %__a, align 16
+ %tmp1 = load <4 x i32>* %xU32x4, align 16
+ store <4 x i32> %tmp1, <4 x i32>* %__b, align 16
+ %tmp2 = load <4 x i32>* %__a, align 16
+ %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
+ %tmp4 = load <4 x i32>* %__b, align 16
+ %tmp5 = bitcast <4 x i32> %tmp4 to <16 x i8>
+ %tmp6 = bitcast <16 x i8> %tmp3 to <4 x i32>
+ %tmp7 = bitcast <16 x i8> %tmp5 to <4 x i32>
+ %vext = shufflevector <4 x i32> %tmp6, <4 x i32> %tmp7, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ store <4 x i32> %vext, <4 x i32>* %xU32x4, align 16
+ ret void
+}
+
+define void @test_vextq_f32() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_f32:
+ ; CHECK: {{ext.16.*#12}}
+ %xF32x4 = alloca <4 x float>, align 16
+ %__a = alloca <4 x float>, align 16
+ %__b = alloca <4 x float>, align 16
+ %tmp = load <4 x float>* %xF32x4, align 16
+ store <4 x float> %tmp, <4 x float>* %__a, align 16
+ %tmp1 = load <4 x float>* %xF32x4, align 16
+ store <4 x float> %tmp1, <4 x float>* %__b, align 16
+ %tmp2 = load <4 x float>* %__a, align 16
+ %tmp3 = bitcast <4 x float> %tmp2 to <16 x i8>
+ %tmp4 = load <4 x float>* %__b, align 16
+ %tmp5 = bitcast <4 x float> %tmp4 to <16 x i8>
+ %tmp6 = bitcast <16 x i8> %tmp3 to <4 x float>
+ %tmp7 = bitcast <16 x i8> %tmp5 to <4 x float>
+ %vext = shufflevector <4 x float> %tmp6, <4 x float> %tmp7, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+ store <4 x float> %vext, <4 x float>* %xF32x4, align 16
+ ret void
+}
+
+define void @test_vextq_s64() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_s64:
+ ; CHECK: {{ext.16.*#8}}
+ %xS64x2 = alloca <2 x i64>, align 16
+ %__a = alloca <2 x i64>, align 16
+ %__b = alloca <2 x i64>, align 16
+ %tmp = load <2 x i64>* %xS64x2, align 16
+ store <2 x i64> %tmp, <2 x i64>* %__a, align 16
+ %tmp1 = load <2 x i64>* %xS64x2, align 16
+ store <2 x i64> %tmp1, <2 x i64>* %__b, align 16
+ %tmp2 = load <2 x i64>* %__a, align 16
+ %tmp3 = bitcast <2 x i64> %tmp2 to <16 x i8>
+ %tmp4 = load <2 x i64>* %__b, align 16
+ %tmp5 = bitcast <2 x i64> %tmp4 to <16 x i8>
+ %tmp6 = bitcast <16 x i8> %tmp3 to <2 x i64>
+ %tmp7 = bitcast <16 x i8> %tmp5 to <2 x i64>
+ %vext = shufflevector <2 x i64> %tmp6, <2 x i64> %tmp7, <2 x i32> <i32 1, i32 2>
+ store <2 x i64> %vext, <2 x i64>* %xS64x2, align 16
+ ret void
+}
+
+define void @test_vextq_u64() nounwind ssp {
+ ; CHECK-LABEL: test_vextq_u64:
+ ; CHECK: {{ext.16.*#8}}
+ %xU64x2 = alloca <2 x i64>, align 16
+ %__a = alloca <2 x i64>, align 16
+ %__b = alloca <2 x i64>, align 16
+ %tmp = load <2 x i64>* %xU64x2, align 16
+ store <2 x i64> %tmp, <2 x i64>* %__a, align 16
+ %tmp1 = load <2 x i64>* %xU64x2, align 16
+ store <2 x i64> %tmp1, <2 x i64>* %__b, align 16
+ %tmp2 = load <2 x i64>* %__a, align 16
+ %tmp3 = bitcast <2 x i64> %tmp2 to <16 x i8>
+ %tmp4 = load <2 x i64>* %__b, align 16
+ %tmp5 = bitcast <2 x i64> %tmp4 to <16 x i8>
+ %tmp6 = bitcast <16 x i8> %tmp3 to <2 x i64>
+ %tmp7 = bitcast <16 x i8> %tmp5 to <2 x i64>
+ %vext = shufflevector <2 x i64> %tmp6, <2 x i64> %tmp7, <2 x i32> <i32 1, i32 2>
+ store <2 x i64> %vext, <2 x i64>* %xU64x2, align 16
+ ret void
+}
+
+; Shuffles with an undef second operand can also use an EXT, so long as the
+; indices wrap around and stay sequential.
+; rdar://12051674
+define <16 x i8> @vext1(<16 x i8> %_a) nounwind {
+; CHECK-LABEL: vext1:
+; CHECK: ext.16b v0, v0, v0, #8
+ %vext = shufflevector <16 x i8> %_a, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <16 x i8> %vext
+}
+
+; <rdar://problem/12212062>
+define <2 x i64> @vext2(<2 x i64> %p0, <2 x i64> %p1) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: vext2:
+; CHECK: ext.16b v1, v1, v1, #8
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: add.2d v0, v0, v1
+ %t0 = shufflevector <2 x i64> %p1, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+ %t1 = shufflevector <2 x i64> %p0, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+ %t2 = add <2 x i64> %t1, %t0
+ ret <2 x i64> %t2
+}
diff --git a/test/CodeGen/AArch64/arm64-vext_reverse.ll b/test/CodeGen/AArch64/arm64-vext_reverse.ll
new file mode 100644
index 0000000..c45e55e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vext_reverse.ll
@@ -0,0 +1,172 @@
+; RUN: llc -mtriple=arm64-linux-gnuabi < %s | FileCheck %s
+
+; The following tests check the correctness of reversing the input operands of
+; vext by enumerating all cases of using two undefs in the shuffle masks.
+
+define <4 x i16> @vext_6701_0(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_6701_0:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #4
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_6701_12(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_6701_12:
+; CHECK: ext v0.8b, v0.8b, v0.8b, #4
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_6701_13(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_6701_13:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #4
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 undef, i32 7, i32 undef, i32 1>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_6701_14(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_6701_14:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #4
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 undef, i32 7, i32 0, i32 undef>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_6701_23(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_6701_23:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #4
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 6, i32 undef, i32 undef, i32 1>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_6701_24(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_6701_24:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #4
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 6, i32 undef, i32 0, i32 undef>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_6701_34(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_6701_34:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #4
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 6, i32 7, i32 undef, i32 undef>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_5670_0(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_5670_0:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #2
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 5, i32 6, i32 7, i32 0>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_5670_12(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_5670_12:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #2
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 undef, i32 undef, i32 7, i32 0>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_5670_13(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_5670_13:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #2
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 undef, i32 6, i32 undef, i32 0>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_5670_14(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_5670_14:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #2
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 undef, i32 6, i32 7, i32 undef>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_5670_23(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_5670_23:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #2
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 5, i32 undef, i32 undef, i32 0>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_5670_24(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_5670_24:
+; CHECK: rev32 v0.4h, v1.4h
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 5, i32 undef, i32 7, i32 undef>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_5670_34(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_5670_34:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #2
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 5, i32 6, i32 undef, i32 undef>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_7012_0(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_7012_0:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #6
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 7, i32 0, i32 1, i32 2>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_7012_12(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_7012_12:
+; CHECK: ext v0.8b, v0.8b, v0.8b, #6
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 undef, i32 undef, i32 1, i32 2>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_7012_13(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_7012_13:
+; CHECK: rev32 v0.4h, v0.4h
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 undef, i32 0, i32 undef, i32 2>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_7012_14(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_7012_14:
+; CHECK: ext v0.8b, v0.8b, v0.8b, #6
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_7012_23(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_7012_23:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #6
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 7, i32 undef, i32 undef, i32 2>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_7012_24(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_7012_24:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #6
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 7, i32 undef, i32 1, i32 undef>
+ ret <4 x i16> %x
+}
+
+define <4 x i16> @vext_7012_34(<4 x i16> %a1, <4 x i16> %a2) {
+entry:
+; CHECK-LABEL: vext_7012_34:
+; CHECK: ext v0.8b, v1.8b, v0.8b, #6
+ %x = shufflevector <4 x i16> %a1, <4 x i16> %a2, <4 x i32> <i32 7, i32 0, i32 undef, i32 undef>
+ ret <4 x i16> %x
+}
diff --git a/test/CodeGen/AArch64/arm64-vfloatintrinsics.ll b/test/CodeGen/AArch64/arm64-vfloatintrinsics.ll
new file mode 100644
index 0000000..255a182
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vfloatintrinsics.ll
@@ -0,0 +1,375 @@
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
+
+;;; Float vectors
+
+%v2f32 = type <2 x float>
+; CHECK: test_v2f32.sqrt:
+define %v2f32 @test_v2f32.sqrt(%v2f32 %a) {
+ ; CHECK: fsqrt.2s
+ %1 = call %v2f32 @llvm.sqrt.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.powi:
+define %v2f32 @test_v2f32.powi(%v2f32 %a, i32 %b) {
+ ; CHECK: pow
+ %1 = call %v2f32 @llvm.powi.v2f32(%v2f32 %a, i32 %b)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.sin:
+define %v2f32 @test_v2f32.sin(%v2f32 %a) {
+ ; CHECK: sin
+ %1 = call %v2f32 @llvm.sin.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.cos:
+define %v2f32 @test_v2f32.cos(%v2f32 %a) {
+ ; CHECK: cos
+ %1 = call %v2f32 @llvm.cos.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.pow:
+define %v2f32 @test_v2f32.pow(%v2f32 %a, %v2f32 %b) {
+ ; CHECK: pow
+ %1 = call %v2f32 @llvm.pow.v2f32(%v2f32 %a, %v2f32 %b)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.exp:
+define %v2f32 @test_v2f32.exp(%v2f32 %a) {
+ ; CHECK: exp
+ %1 = call %v2f32 @llvm.exp.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.exp2:
+define %v2f32 @test_v2f32.exp2(%v2f32 %a) {
+ ; CHECK: exp
+ %1 = call %v2f32 @llvm.exp2.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.log:
+define %v2f32 @test_v2f32.log(%v2f32 %a) {
+ ; CHECK: log
+ %1 = call %v2f32 @llvm.log.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.log10:
+define %v2f32 @test_v2f32.log10(%v2f32 %a) {
+ ; CHECK: log
+ %1 = call %v2f32 @llvm.log10.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.log2:
+define %v2f32 @test_v2f32.log2(%v2f32 %a) {
+ ; CHECK: log
+ %1 = call %v2f32 @llvm.log2.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.fma:
+define %v2f32 @test_v2f32.fma(%v2f32 %a, %v2f32 %b, %v2f32 %c) {
+ ; CHECK: fma
+ %1 = call %v2f32 @llvm.fma.v2f32(%v2f32 %a, %v2f32 %b, %v2f32 %c)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.fabs:
+define %v2f32 @test_v2f32.fabs(%v2f32 %a) {
+ ; CHECK: fabs
+ %1 = call %v2f32 @llvm.fabs.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.floor:
+define %v2f32 @test_v2f32.floor(%v2f32 %a) {
+ ; CHECK: frintm.2s
+ %1 = call %v2f32 @llvm.floor.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.ceil:
+define %v2f32 @test_v2f32.ceil(%v2f32 %a) {
+ ; CHECK: frintp.2s
+ %1 = call %v2f32 @llvm.ceil.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.trunc:
+define %v2f32 @test_v2f32.trunc(%v2f32 %a) {
+ ; CHECK: frintz.2s
+ %1 = call %v2f32 @llvm.trunc.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.rint:
+define %v2f32 @test_v2f32.rint(%v2f32 %a) {
+ ; CHECK: frintx.2s
+ %1 = call %v2f32 @llvm.rint.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+; CHECK: test_v2f32.nearbyint:
+define %v2f32 @test_v2f32.nearbyint(%v2f32 %a) {
+ ; CHECK: frinti.2s
+ %1 = call %v2f32 @llvm.nearbyint.v2f32(%v2f32 %a)
+ ret %v2f32 %1
+}
+
+declare %v2f32 @llvm.sqrt.v2f32(%v2f32) #0
+declare %v2f32 @llvm.powi.v2f32(%v2f32, i32) #0
+declare %v2f32 @llvm.sin.v2f32(%v2f32) #0
+declare %v2f32 @llvm.cos.v2f32(%v2f32) #0
+declare %v2f32 @llvm.pow.v2f32(%v2f32, %v2f32) #0
+declare %v2f32 @llvm.exp.v2f32(%v2f32) #0
+declare %v2f32 @llvm.exp2.v2f32(%v2f32) #0
+declare %v2f32 @llvm.log.v2f32(%v2f32) #0
+declare %v2f32 @llvm.log10.v2f32(%v2f32) #0
+declare %v2f32 @llvm.log2.v2f32(%v2f32) #0
+declare %v2f32 @llvm.fma.v2f32(%v2f32, %v2f32, %v2f32) #0
+declare %v2f32 @llvm.fabs.v2f32(%v2f32) #0
+declare %v2f32 @llvm.floor.v2f32(%v2f32) #0
+declare %v2f32 @llvm.ceil.v2f32(%v2f32) #0
+declare %v2f32 @llvm.trunc.v2f32(%v2f32) #0
+declare %v2f32 @llvm.rint.v2f32(%v2f32) #0
+declare %v2f32 @llvm.nearbyint.v2f32(%v2f32) #0
+
+;;;
+
+%v4f32 = type <4 x float>
+; CHECK: test_v4f32.sqrt:
+define %v4f32 @test_v4f32.sqrt(%v4f32 %a) {
+ ; CHECK: fsqrt.4s
+ %1 = call %v4f32 @llvm.sqrt.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.powi:
+define %v4f32 @test_v4f32.powi(%v4f32 %a, i32 %b) {
+ ; CHECK: pow
+ %1 = call %v4f32 @llvm.powi.v4f32(%v4f32 %a, i32 %b)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.sin:
+define %v4f32 @test_v4f32.sin(%v4f32 %a) {
+ ; CHECK: sin
+ %1 = call %v4f32 @llvm.sin.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.cos:
+define %v4f32 @test_v4f32.cos(%v4f32 %a) {
+ ; CHECK: cos
+ %1 = call %v4f32 @llvm.cos.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.pow:
+define %v4f32 @test_v4f32.pow(%v4f32 %a, %v4f32 %b) {
+ ; CHECK: pow
+ %1 = call %v4f32 @llvm.pow.v4f32(%v4f32 %a, %v4f32 %b)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.exp:
+define %v4f32 @test_v4f32.exp(%v4f32 %a) {
+ ; CHECK: exp
+ %1 = call %v4f32 @llvm.exp.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.exp2:
+define %v4f32 @test_v4f32.exp2(%v4f32 %a) {
+ ; CHECK: exp
+ %1 = call %v4f32 @llvm.exp2.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.log:
+define %v4f32 @test_v4f32.log(%v4f32 %a) {
+ ; CHECK: log
+ %1 = call %v4f32 @llvm.log.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.log10:
+define %v4f32 @test_v4f32.log10(%v4f32 %a) {
+ ; CHECK: log
+ %1 = call %v4f32 @llvm.log10.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.log2:
+define %v4f32 @test_v4f32.log2(%v4f32 %a) {
+ ; CHECK: log
+ %1 = call %v4f32 @llvm.log2.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.fma:
+define %v4f32 @test_v4f32.fma(%v4f32 %a, %v4f32 %b, %v4f32 %c) {
+ ; CHECK: fma
+ %1 = call %v4f32 @llvm.fma.v4f32(%v4f32 %a, %v4f32 %b, %v4f32 %c)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.fabs:
+define %v4f32 @test_v4f32.fabs(%v4f32 %a) {
+ ; CHECK: fabs
+ %1 = call %v4f32 @llvm.fabs.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.floor:
+define %v4f32 @test_v4f32.floor(%v4f32 %a) {
+ ; CHECK: frintm.4s
+ %1 = call %v4f32 @llvm.floor.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.ceil:
+define %v4f32 @test_v4f32.ceil(%v4f32 %a) {
+ ; CHECK: frintp.4s
+ %1 = call %v4f32 @llvm.ceil.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.trunc:
+define %v4f32 @test_v4f32.trunc(%v4f32 %a) {
+ ; CHECK: frintz.4s
+ %1 = call %v4f32 @llvm.trunc.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.rint:
+define %v4f32 @test_v4f32.rint(%v4f32 %a) {
+ ; CHECK: frintx.4s
+ %1 = call %v4f32 @llvm.rint.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+; CHECK: test_v4f32.nearbyint:
+define %v4f32 @test_v4f32.nearbyint(%v4f32 %a) {
+ ; CHECK: frinti.4s
+ %1 = call %v4f32 @llvm.nearbyint.v4f32(%v4f32 %a)
+ ret %v4f32 %1
+}
+
+declare %v4f32 @llvm.sqrt.v4f32(%v4f32) #0
+declare %v4f32 @llvm.powi.v4f32(%v4f32, i32) #0
+declare %v4f32 @llvm.sin.v4f32(%v4f32) #0
+declare %v4f32 @llvm.cos.v4f32(%v4f32) #0
+declare %v4f32 @llvm.pow.v4f32(%v4f32, %v4f32) #0
+declare %v4f32 @llvm.exp.v4f32(%v4f32) #0
+declare %v4f32 @llvm.exp2.v4f32(%v4f32) #0
+declare %v4f32 @llvm.log.v4f32(%v4f32) #0
+declare %v4f32 @llvm.log10.v4f32(%v4f32) #0
+declare %v4f32 @llvm.log2.v4f32(%v4f32) #0
+declare %v4f32 @llvm.fma.v4f32(%v4f32, %v4f32, %v4f32) #0
+declare %v4f32 @llvm.fabs.v4f32(%v4f32) #0
+declare %v4f32 @llvm.floor.v4f32(%v4f32) #0
+declare %v4f32 @llvm.ceil.v4f32(%v4f32) #0
+declare %v4f32 @llvm.trunc.v4f32(%v4f32) #0
+declare %v4f32 @llvm.rint.v4f32(%v4f32) #0
+declare %v4f32 @llvm.nearbyint.v4f32(%v4f32) #0
+
+;;; Double vector
+
+%v2f64 = type <2 x double>
+; CHECK: test_v2f64.sqrt:
+define %v2f64 @test_v2f64.sqrt(%v2f64 %a) {
+ ; CHECK: fsqrt.2d
+ %1 = call %v2f64 @llvm.sqrt.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.powi:
+define %v2f64 @test_v2f64.powi(%v2f64 %a, i32 %b) {
+ ; CHECK: pow
+ %1 = call %v2f64 @llvm.powi.v2f64(%v2f64 %a, i32 %b)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.sin:
+define %v2f64 @test_v2f64.sin(%v2f64 %a) {
+ ; CHECK: sin
+ %1 = call %v2f64 @llvm.sin.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.cos:
+define %v2f64 @test_v2f64.cos(%v2f64 %a) {
+ ; CHECK: cos
+ %1 = call %v2f64 @llvm.cos.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.pow:
+define %v2f64 @test_v2f64.pow(%v2f64 %a, %v2f64 %b) {
+ ; CHECK: pow
+ %1 = call %v2f64 @llvm.pow.v2f64(%v2f64 %a, %v2f64 %b)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.exp:
+define %v2f64 @test_v2f64.exp(%v2f64 %a) {
+ ; CHECK: exp
+ %1 = call %v2f64 @llvm.exp.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.exp2:
+define %v2f64 @test_v2f64.exp2(%v2f64 %a) {
+ ; CHECK: exp
+ %1 = call %v2f64 @llvm.exp2.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.log:
+define %v2f64 @test_v2f64.log(%v2f64 %a) {
+ ; CHECK: log
+ %1 = call %v2f64 @llvm.log.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.log10:
+define %v2f64 @test_v2f64.log10(%v2f64 %a) {
+ ; CHECK: log
+ %1 = call %v2f64 @llvm.log10.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.log2:
+define %v2f64 @test_v2f64.log2(%v2f64 %a) {
+ ; CHECK: log
+ %1 = call %v2f64 @llvm.log2.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.fma:
+define %v2f64 @test_v2f64.fma(%v2f64 %a, %v2f64 %b, %v2f64 %c) {
+ ; CHECK: fma
+ %1 = call %v2f64 @llvm.fma.v2f64(%v2f64 %a, %v2f64 %b, %v2f64 %c)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.fabs:
+define %v2f64 @test_v2f64.fabs(%v2f64 %a) {
+ ; CHECK: fabs
+ %1 = call %v2f64 @llvm.fabs.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.floor:
+define %v2f64 @test_v2f64.floor(%v2f64 %a) {
+ ; CHECK: frintm.2d
+ %1 = call %v2f64 @llvm.floor.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.ceil:
+define %v2f64 @test_v2f64.ceil(%v2f64 %a) {
+ ; CHECK: frintp.2d
+ %1 = call %v2f64 @llvm.ceil.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.trunc:
+define %v2f64 @test_v2f64.trunc(%v2f64 %a) {
+ ; CHECK: frintz.2d
+ %1 = call %v2f64 @llvm.trunc.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.rint:
+define %v2f64 @test_v2f64.rint(%v2f64 %a) {
+ ; CHECK: frintx.2d
+ %1 = call %v2f64 @llvm.rint.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+; CHECK: test_v2f64.nearbyint:
+define %v2f64 @test_v2f64.nearbyint(%v2f64 %a) {
+ ; CHECK: frinti.2d
+ %1 = call %v2f64 @llvm.nearbyint.v2f64(%v2f64 %a)
+ ret %v2f64 %1
+}
+
+declare %v2f64 @llvm.sqrt.v2f64(%v2f64) #0
+declare %v2f64 @llvm.powi.v2f64(%v2f64, i32) #0
+declare %v2f64 @llvm.sin.v2f64(%v2f64) #0
+declare %v2f64 @llvm.cos.v2f64(%v2f64) #0
+declare %v2f64 @llvm.pow.v2f64(%v2f64, %v2f64) #0
+declare %v2f64 @llvm.exp.v2f64(%v2f64) #0
+declare %v2f64 @llvm.exp2.v2f64(%v2f64) #0
+declare %v2f64 @llvm.log.v2f64(%v2f64) #0
+declare %v2f64 @llvm.log10.v2f64(%v2f64) #0
+declare %v2f64 @llvm.log2.v2f64(%v2f64) #0
+declare %v2f64 @llvm.fma.v2f64(%v2f64, %v2f64, %v2f64) #0
+declare %v2f64 @llvm.fabs.v2f64(%v2f64) #0
+declare %v2f64 @llvm.floor.v2f64(%v2f64) #0
+declare %v2f64 @llvm.ceil.v2f64(%v2f64) #0
+declare %v2f64 @llvm.trunc.v2f64(%v2f64) #0
+declare %v2f64 @llvm.rint.v2f64(%v2f64) #0
+declare %v2f64 @llvm.nearbyint.v2f64(%v2f64) #0
+
+attributes #0 = { nounwind readonly }
diff --git a/test/CodeGen/AArch64/arm64-vhadd.ll b/test/CodeGen/AArch64/arm64-vhadd.ll
new file mode 100644
index 0000000..6178bf9
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vhadd.ll
@@ -0,0 +1,249 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @shadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: shadd8b:
+;CHECK: shadd.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @shadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: shadd16b:
+;CHECK: shadd.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @shadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: shadd4h:
+;CHECK: shadd.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @shadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: shadd8h:
+;CHECK: shadd.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @shadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: shadd2s:
+;CHECK: shadd.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @shadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: shadd4s:
+;CHECK: shadd.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <8 x i8> @uhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uhadd8b:
+;CHECK: uhadd.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @uhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uhadd16b:
+;CHECK: uhadd.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @uhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uhadd4h:
+;CHECK: uhadd.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @uhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uhadd8h:
+;CHECK: uhadd.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @uhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uhadd2s:
+;CHECK: uhadd.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @uhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uhadd4s:
+;CHECK: uhadd.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @srhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: srhadd8b:
+;CHECK: srhadd.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @srhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: srhadd16b:
+;CHECK: srhadd.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @srhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: srhadd4h:
+;CHECK: srhadd.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @srhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: srhadd8h:
+;CHECK: srhadd.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @srhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: srhadd2s:
+;CHECK: srhadd.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @srhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: srhadd4s:
+;CHECK: srhadd.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <8 x i8> @urhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: urhadd8b:
+;CHECK: urhadd.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @urhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: urhadd16b:
+;CHECK: urhadd.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @urhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: urhadd4h:
+;CHECK: urhadd.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @urhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: urhadd8h:
+;CHECK: urhadd.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @urhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: urhadd2s:
+;CHECK: urhadd.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @urhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: urhadd4s:
+;CHECK: urhadd.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-vhsub.ll b/test/CodeGen/AArch64/arm64-vhsub.ll
new file mode 100644
index 0000000..13bfda3
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vhsub.ll
@@ -0,0 +1,125 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @shsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: shsub8b:
+;CHECK: shsub.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @shsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: shsub16b:
+;CHECK: shsub.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @shsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: shsub4h:
+;CHECK: shsub.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @shsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: shsub8h:
+;CHECK: shsub.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @shsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: shsub2s:
+;CHECK: shsub.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @shsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: shsub4s:
+;CHECK: shsub.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <8 x i8> @uhsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uhsub8b:
+;CHECK: uhsub.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @uhsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uhsub16b:
+;CHECK: uhsub.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @uhsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uhsub4h:
+;CHECK: uhsub.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @uhsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uhsub8h:
+;CHECK: uhsub.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @uhsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uhsub2s:
+;CHECK: uhsub.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @uhsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uhsub4s:
+;CHECK: uhsub.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-virtual_base.ll b/test/CodeGen/AArch64/arm64-virtual_base.ll
new file mode 100644
index 0000000..cb95954
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-virtual_base.ll
@@ -0,0 +1,51 @@
+; RUN: llc < %s -O3 -march arm64 | FileCheck %s
+; <rdar://13463602>
+
+%struct.Counter_Struct = type { i64, i64 }
+%struct.Bicubic_Patch_Struct = type { %struct.Method_Struct*, i32, %struct.Object_Struct*, %struct.Texture_Struct*, %struct.Interior_Struct*, %struct.Object_Struct*, %struct.Object_Struct*, %struct.Bounding_Box_Struct, i64, i32, i32, i32, [4 x [4 x [3 x double]]], [3 x double], double, double, %struct.Bezier_Node_Struct* }
+%struct.Method_Struct = type { i32 (%struct.Object_Struct*, %struct.Ray_Struct*, %struct.istack_struct*)*, i32 (double*, %struct.Object_Struct*)*, void (double*, %struct.Object_Struct*, %struct.istk_entry*)*, i8* (%struct.Object_Struct*)*, void (%struct.Object_Struct*, double*, %struct.Transform_Struct*)*, void (%struct.Object_Struct*, double*, %struct.Transform_Struct*)*, void (%struct.Object_Struct*, double*, %struct.Transform_Struct*)*, void (%struct.Object_Struct*, %struct.Transform_Struct*)*, void (%struct.Object_Struct*)*, void (%struct.Object_Struct*)* }
+%struct.Object_Struct = type { %struct.Method_Struct*, i32, %struct.Object_Struct*, %struct.Texture_Struct*, %struct.Interior_Struct*, %struct.Object_Struct*, %struct.Object_Struct*, %struct.Bounding_Box_Struct, i64 }
+%struct.Texture_Struct = type { i16, i16, i16, i32, float, float, float, %struct.Warps_Struct*, %struct.Pattern_Struct*, %struct.Blend_Map_Struct*, %union.anon.9, %struct.Texture_Struct*, %struct.Pigment_Struct*, %struct.Tnormal_Struct*, %struct.Finish_Struct*, %struct.Texture_Struct*, i32 }
+%struct.Warps_Struct = type { i16, %struct.Warps_Struct* }
+%struct.Pattern_Struct = type { i16, i16, i16, i32, float, float, float, %struct.Warps_Struct*, %struct.Pattern_Struct*, %struct.Blend_Map_Struct*, %union.anon.6 }
+%struct.Blend_Map_Struct = type { i16, i16, i16, i64, %struct.Blend_Map_Entry* }
+%struct.Blend_Map_Entry = type { float, i8, %union.anon }
+%union.anon = type { [2 x double], [8 x i8] }
+%union.anon.6 = type { %struct.anon.7 }
+%struct.anon.7 = type { float, [3 x double] }
+%union.anon.9 = type { %struct.anon.10 }
+%struct.anon.10 = type { float, [3 x double] }
+%struct.Pigment_Struct = type { i16, i16, i16, i32, float, float, float, %struct.Warps_Struct*, %struct.Pattern_Struct*, %struct.Blend_Map_Struct*, %union.anon.0, [5 x float] }
+%union.anon.0 = type { %struct.anon }
+%struct.anon = type { float, [3 x double] }
+%struct.Tnormal_Struct = type { i16, i16, i16, i32, float, float, float, %struct.Warps_Struct*, %struct.Pattern_Struct*, %struct.Blend_Map_Struct*, %union.anon.3, float }
+%union.anon.3 = type { %struct.anon.4 }
+%struct.anon.4 = type { float, [3 x double] }
+%struct.Finish_Struct = type { float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, [3 x float], [3 x float] }
+%struct.Interior_Struct = type { i64, i32, float, float, float, float, float, %struct.Media_Struct* }
+%struct.Media_Struct = type { i32, i32, i32, i32, i32, double, double, i32, i32, i32, i32, [5 x float], [5 x float], [5 x float], [5 x float], double, double, double, double*, %struct.Pigment_Struct*, %struct.Media_Struct* }
+%struct.Bounding_Box_Struct = type { [3 x float], [3 x float] }
+%struct.Ray_Struct = type { [3 x double], [3 x double], i32, [100 x %struct.Interior_Struct*] }
+%struct.istack_struct = type { %struct.istack_struct*, %struct.istk_entry*, i32 }
+%struct.istk_entry = type { double, [3 x double], [3 x double], %struct.Object_Struct*, i32, i32, double, double, i8* }
+%struct.Transform_Struct = type { [4 x [4 x double]], [4 x [4 x double]] }
+%struct.Bezier_Node_Struct = type { i32, [3 x double], double, i32, i8* }
+
+define void @Precompute_Patch_Values(%struct.Bicubic_Patch_Struct* %Shape) {
+; CHECK: Precompute_Patch_Values
+; CHECK: ldr [[VAL:x[0-9]+]], [x0, #288]
+; CHECK-NEXT: str [[VAL]], [sp, #232]
+; CHECK-NEXT: ldr [[VAL2:q[0-9]+]], [x0, #272]
+; CHECK-NEXT: stur [[VAL2]], {{\[}}sp, #216]
+entry:
+ %Control_Points = alloca [16 x [3 x double]], align 8
+ %arraydecay5.3.1 = getelementptr inbounds [16 x [3 x double]]* %Control_Points, i64 0, i64 9, i64 0
+ %tmp14 = bitcast double* %arraydecay5.3.1 to i8*
+ %arraydecay11.3.1 = getelementptr inbounds %struct.Bicubic_Patch_Struct* %Shape, i64 0, i32 12, i64 1, i64 3, i64 0
+ %tmp15 = bitcast double* %arraydecay11.3.1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp14, i8* %tmp15, i64 24, i32 1, i1 false)
+ ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1)
diff --git a/test/CodeGen/AArch64/arm64-vmax.ll b/test/CodeGen/AArch64/arm64-vmax.ll
new file mode 100644
index 0000000..3f2c134
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vmax.ll
@@ -0,0 +1,679 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @smax_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: smax_8b:
+;CHECK: smax.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @smax_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: smax_16b:
+;CHECK: smax.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @smax_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: smax_4h:
+;CHECK: smax.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @smax_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: smax_8h:
+;CHECK: smax.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @smax_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: smax_2s:
+;CHECK: smax.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @smax_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: smax_4s:
+;CHECK: smax.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @umax_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: umax_8b:
+;CHECK: umax.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @umax_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: umax_16b:
+;CHECK: umax.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @umax_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: umax_4h:
+;CHECK: umax.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @umax_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: umax_8h:
+;CHECK: umax.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @umax_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: umax_2s:
+;CHECK: umax.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @umax_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: umax_4s:
+;CHECK: umax.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @smin_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: smin_8b:
+;CHECK: smin.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @smin_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: smin_16b:
+;CHECK: smin.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @smin_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: smin_4h:
+;CHECK: smin.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @smin_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: smin_8h:
+;CHECK: smin.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @smin_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: smin_2s:
+;CHECK: smin.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @smin_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: smin_4s:
+;CHECK: smin.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @umin_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: umin_8b:
+;CHECK: umin.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @umin_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: umin_16b:
+;CHECK: umin.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @umin_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: umin_4h:
+;CHECK: umin.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @umin_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: umin_8h:
+;CHECK: umin.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @umin_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: umin_2s:
+;CHECK: umin.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @umin_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: umin_4s:
+;CHECK: umin.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @smaxp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: smaxp_8b:
+;CHECK: smaxp.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @smaxp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: smaxp_16b:
+;CHECK: smaxp.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.smaxp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @smaxp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: smaxp_4h:
+;CHECK: smaxp.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @smaxp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: smaxp_8h:
+;CHECK: smaxp.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.smaxp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @smaxp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: smaxp_2s:
+;CHECK: smaxp.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @smaxp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: smaxp_4s:
+;CHECK: smaxp.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.smaxp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.smaxp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.smaxp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.smaxp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @umaxp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: umaxp_8b:
+;CHECK: umaxp.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @umaxp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: umaxp_16b:
+;CHECK: umaxp.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.umaxp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @umaxp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: umaxp_4h:
+;CHECK: umaxp.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @umaxp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: umaxp_8h:
+;CHECK: umaxp.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.umaxp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @umaxp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: umaxp_2s:
+;CHECK: umaxp.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @umaxp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: umaxp_4s:
+;CHECK: umaxp.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.umaxp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.umaxp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.umaxp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.umaxp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @sminp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sminp_8b:
+;CHECK: sminp.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @sminp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sminp_16b:
+;CHECK: sminp.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sminp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @sminp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sminp_4h:
+;CHECK: sminp.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sminp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sminp_8h:
+;CHECK: sminp.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sminp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @sminp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sminp_2s:
+;CHECK: sminp.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sminp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sminp_4s:
+;CHECK: sminp.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sminp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.sminp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sminp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sminp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @uminp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uminp_8b:
+;CHECK: uminp.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uminp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @uminp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uminp_16b:
+;CHECK: uminp.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.uminp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @uminp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uminp_4h:
+;CHECK: uminp.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @uminp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uminp_8h:
+;CHECK: uminp.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uminp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @uminp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uminp_2s:
+;CHECK: uminp.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @uminp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uminp_4s:
+;CHECK: uminp.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uminp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.uminp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.uminp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uminp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uminp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <2 x float> @fmax_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fmax_2s:
+;CHECK: fmax.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @fmax_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fmax_4s:
+;CHECK: fmax.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @fmax_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fmax_2d:
+;CHECK: fmax.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.fmax.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fmax.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x float> @fmaxp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fmaxp_2s:
+;CHECK: fmaxp.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @fmaxp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fmaxp_4s:
+;CHECK: fmaxp.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.fmaxp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @fmaxp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fmaxp_2d:
+;CHECK: fmaxp.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.fmaxp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fmaxp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fmaxp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x float> @fmin_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fmin_2s:
+;CHECK: fmin.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @fmin_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fmin_4s:
+;CHECK: fmin.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @fmin_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fmin_2d:
+;CHECK: fmin.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.fmin.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fmin.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x float> @fminp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fminp_2s:
+;CHECK: fminp.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @fminp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fminp_4s:
+;CHECK: fminp.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.fminp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @fminp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fminp_2d:
+;CHECK: fminp.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.fminp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fminp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fminp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x float> @fminnmp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fminnmp_2s:
+;CHECK: fminnmp.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.fminnmp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @fminnmp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fminnmp_4s:
+;CHECK: fminnmp.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.fminnmp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @fminnmp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fminnmp_2d:
+;CHECK: fminnmp.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.fminnmp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.fminnmp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fminnmp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fminnmp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x float> @fmaxnmp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fmaxnmp_2s:
+;CHECK: fmaxnmp.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.fmaxnmp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @fmaxnmp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fmaxnmp_4s:
+;CHECK: fmaxnmp.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @fmaxnmp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fmaxnmp_2d:
+;CHECK: fmaxnmp.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.fmaxnmp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double>, <2 x double>) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-vminmaxnm.ll b/test/CodeGen/AArch64/arm64-vminmaxnm.ll
new file mode 100644
index 0000000..b5aca45
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vminmaxnm.ll
@@ -0,0 +1,68 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
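+; Editor's descriptive note (not part of the original commit): these functions check
+; that the vector fmaxnm/fminnm intrinsics and the f64.v2f64 fmaxnmv/fminnmv
+; reductions select the expected NEON maxnm/minnm instructions.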
+
+define <2 x float> @f1(<2 x float> %a, <2 x float> %b) nounwind readnone ssp {
+; CHECK: fmaxnm.2s v0, v0, v1
+; CHECK: ret
+ %vmaxnm2.i = tail call <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float> %a, <2 x float> %b) nounwind
+ ret <2 x float> %vmaxnm2.i
+}
+
+define <4 x float> @f2(<4 x float> %a, <4 x float> %b) nounwind readnone ssp {
+; CHECK: fmaxnm.4s v0, v0, v1
+; CHECK: ret
+ %vmaxnm2.i = tail call <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float> %a, <4 x float> %b) nounwind
+ ret <4 x float> %vmaxnm2.i
+}
+
+define <2 x double> @f3(<2 x double> %a, <2 x double> %b) nounwind readnone ssp {
+; CHECK: fmaxnm.2d v0, v0, v1
+; CHECK: ret
+ %vmaxnm2.i = tail call <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double> %a, <2 x double> %b) nounwind
+ ret <2 x double> %vmaxnm2.i
+}
+
+define <2 x float> @f4(<2 x float> %a, <2 x float> %b) nounwind readnone ssp {
+; CHECK: fminnm.2s v0, v0, v1
+; CHECK: ret
+ %vminnm2.i = tail call <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float> %a, <2 x float> %b) nounwind
+ ret <2 x float> %vminnm2.i
+}
+
+define <4 x float> @f5(<4 x float> %a, <4 x float> %b) nounwind readnone ssp {
+; CHECK: fminnm.4s v0, v0, v1
+; CHECK: ret
+ %vminnm2.i = tail call <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float> %a, <4 x float> %b) nounwind
+ ret <4 x float> %vminnm2.i
+}
+
+define <2 x double> @f6(<2 x double> %a, <2 x double> %b) nounwind readnone ssp {
+; CHECK: fminnm.2d v0, v0, v1
+; CHECK: ret
+ %vminnm2.i = tail call <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double> %a, <2 x double> %b) nounwind
+ ret <2 x double> %vminnm2.i
+}
+
+declare <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float>, <2 x float>) nounwind readnone
+
+
+define double @test_fmaxnmv(<2 x double> %in) {
+; CHECK-LABEL: test_fmaxnmv:
+; CHECK: fmaxnmp.2d d0, v0
+ %max = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> %in)
+ ret double %max
+}
+
+define double @test_fminnmv(<2 x double> %in) {
+; CHECK-LABEL: test_fminnmv:
+; CHECK: fminnmp.2d d0, v0
+ %min = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> %in)
+ ret double %min
+}
+
+declare double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double>)
+declare double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double>)
diff --git a/test/CodeGen/AArch64/arm64-vmovn.ll b/test/CodeGen/AArch64/arm64-vmovn.ll
new file mode 100644
index 0000000..67e2816
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vmovn.ll
@@ -0,0 +1,242 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
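+; Editor's descriptive note (not part of the original commit): narrowing moves —
+; plain vector truncations should select xtn/xtn2, and the saturating narrow
+; intrinsics (sqxtn, uqxtn, sqxtun and their high-half "2" forms) should select
+; the matching instructions with no stray ld1.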
+
+define <8 x i8> @xtn8b(<8 x i16> %A) nounwind {
+;CHECK-LABEL: xtn8b:
+;CHECK-NOT: ld1
+;CHECK: xtn.8b v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = trunc <8 x i16> %A to <8 x i8>
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @xtn4h(<4 x i32> %A) nounwind {
+;CHECK-LABEL: xtn4h:
+;CHECK-NOT: ld1
+;CHECK: xtn.4h v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = trunc <4 x i32> %A to <4 x i16>
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @xtn2s(<2 x i64> %A) nounwind {
+;CHECK-LABEL: xtn2s:
+;CHECK-NOT: ld1
+;CHECK: xtn.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = trunc <2 x i64> %A to <2 x i32>
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @xtn2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind {
+;CHECK-LABEL: xtn2_16b:
+;CHECK-NOT: ld1
+;CHECK: xtn2.16b v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = trunc <8 x i16> %A to <8 x i8>
+ %res = shufflevector <8 x i8> %ret, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @xtn2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind {
+;CHECK-LABEL: xtn2_8h:
+;CHECK-NOT: ld1
+;CHECK: xtn2.8h v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = trunc <4 x i32> %A to <4 x i16>
+ %res = shufflevector <4 x i16> %ret, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @xtn2_4s(<2 x i32> %ret, <2 x i64> %A) nounwind {
+;CHECK-LABEL: xtn2_4s:
+;CHECK-NOT: ld1
+;CHECK: xtn2.4s v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = trunc <2 x i64> %A to <2 x i32>
+ %res = shufflevector <2 x i32> %ret, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+define <8 x i8> @sqxtn8b(<8 x i16> %A) nounwind {
+;CHECK-LABEL: sqxtn8b:
+;CHECK-NOT: ld1
+;CHECK: sqxtn.8b v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %A)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqxtn4h(<4 x i32> %A) nounwind {
+;CHECK-LABEL: sqxtn4h:
+;CHECK-NOT: ld1
+;CHECK: sqxtn.4h v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %A)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqxtn2s(<2 x i64> %A) nounwind {
+;CHECK-LABEL: sqxtn2s:
+;CHECK-NOT: ld1
+;CHECK: sqxtn.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqxtn2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind {
+;CHECK-LABEL: sqxtn2_16b:
+;CHECK-NOT: ld1
+;CHECK: sqxtn2.16b v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %A)
+ %res = shufflevector <8 x i8> %ret, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @sqxtn2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind {
+;CHECK-LABEL: sqxtn2_8h:
+;CHECK-NOT: ld1
+;CHECK: sqxtn2.8h v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %A)
+ %res = shufflevector <4 x i16> %ret, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @sqxtn2_4s(<2 x i32> %ret, <2 x i64> %A) nounwind {
+;CHECK-LABEL: sqxtn2_4s:
+;CHECK-NOT: ld1
+;CHECK: sqxtn2.4s v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %A)
+ %res = shufflevector <2 x i32> %ret, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+declare <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64>) nounwind readnone
+
+define <8 x i8> @uqxtn8b(<8 x i16> %A) nounwind {
+;CHECK-LABEL: uqxtn8b:
+;CHECK-NOT: ld1
+;CHECK: uqxtn.8b v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %A)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @uqxtn4h(<4 x i32> %A) nounwind {
+;CHECK-LABEL: uqxtn4h:
+;CHECK-NOT: ld1
+;CHECK: uqxtn.4h v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %A)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uqxtn2s(<2 x i64> %A) nounwind {
+;CHECK-LABEL: uqxtn2s:
+;CHECK-NOT: ld1
+;CHECK: uqxtn.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @uqxtn2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind {
+;CHECK-LABEL: uqxtn2_16b:
+;CHECK-NOT: ld1
+;CHECK: uqxtn2.16b v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %A)
+ %res = shufflevector <8 x i8> %ret, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @uqxtn2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind {
+;CHECK-LABEL: uqxtn2_8h:
+;CHECK-NOT: ld1
+;CHECK: uqxtn2.8h v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %A)
+ %res = shufflevector <4 x i16> %ret, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @uqxtn2_4s(<2 x i32> %ret, <2 x i64> %A) nounwind {
+;CHECK-LABEL: uqxtn2_4s:
+;CHECK-NOT: ld1
+;CHECK: uqxtn2.4s v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %A)
+ %res = shufflevector <2 x i32> %ret, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+declare <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64>) nounwind readnone
+
+define <8 x i8> @sqxtun8b(<8 x i16> %A) nounwind {
+;CHECK-LABEL: sqxtun8b:
+;CHECK-NOT: ld1
+;CHECK: sqxtun.8b v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %A)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqxtun4h(<4 x i32> %A) nounwind {
+;CHECK-LABEL: sqxtun4h:
+;CHECK-NOT: ld1
+;CHECK: sqxtun.4h v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %A)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqxtun2s(<2 x i64> %A) nounwind {
+;CHECK-LABEL: sqxtun2s:
+;CHECK-NOT: ld1
+;CHECK: sqxtun.2s v0, v0
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %A)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqxtun2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind {
+;CHECK-LABEL: sqxtun2_16b:
+;CHECK-NOT: ld1
+;CHECK: sqxtun2.16b v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %A)
+ %res = shufflevector <8 x i8> %ret, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @sqxtun2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind {
+;CHECK-LABEL: sqxtun2_8h:
+;CHECK-NOT: ld1
+;CHECK: sqxtun2.8h v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %A)
+ %res = shufflevector <4 x i16> %ret, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @sqxtun2_4s(<2 x i32> %ret, <2 x i64> %A) nounwind {
+;CHECK-LABEL: sqxtun2_4s:
+;CHECK-NOT: ld1
+;CHECK: sqxtun2.4s v0, v1
+;CHECK-NEXT: ret
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %A)
+ %res = shufflevector <2 x i32> %ret, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+declare <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64>) nounwind readnone
+
diff --git a/test/CodeGen/AArch64/arm64-vmul.ll b/test/CodeGen/AArch64/arm64-vmul.ll
new file mode 100644
index 0000000..6fa60fe
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vmul.ll
@@ -0,0 +1,2036 @@
+; RUN: llc -asm-verbose=false < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
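+; Editor's descriptive note (not part of the original commit): vector multiply
+; selection — widening multiplies (smull/umull/sqdmull/pmull), doubling-high
+; multiplies (sqdmulh/sqrdmulh), fmulx, multiply-accumulate patterns
+; (smlal/umlal, sqdmlal/sqdmlsl, fmla/fmls) and their by-lane forms.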
+
+
+define <8 x i16> @smull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: smull8h:
+;CHECK: smull.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @smull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: smull4s:
+;CHECK: smull.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @smull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: smull2d:
+;CHECK: smull.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+define <8 x i16> @umull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: umull8h:
+;CHECK: umull.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @umull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: umull4s:
+;CHECK: umull.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @umull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: umull2d:
+;CHECK: umull.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+define <4 x i32> @sqdmull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqdmull4s:
+;CHECK: sqdmull.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sqdmull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqdmull2d:
+;CHECK: sqdmull.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqdmull2_4s:
+;CHECK: sqdmull2.4s
+ %load1 = load <8 x i16>* %A
+ %load2 = load <8 x i16>* %B
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sqdmull2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqdmull2_2d:
+;CHECK: sqdmull2.2d
+ %load1 = load <4 x i32>* %A
+ %load2 = load <4 x i32>* %B
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+
+declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+define <8 x i16> @pmull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: pmull8h:
+;CHECK: pmull.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+
+define <4 x i16> @sqdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqdmulh_4h:
+;CHECK: sqdmulh.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sqdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqdmulh_8h:
+;CHECK: sqdmulh.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @sqdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqdmulh_2s:
+;CHECK: sqdmulh.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sqdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqdmulh_4s:
+;CHECK: sqdmulh.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define i32 @sqdmulh_1s(i32* %A, i32* %B) nounwind {
+;CHECK-LABEL: sqdmulh_1s:
+;CHECK: sqdmulh s0, {{s[0-9]+}}, {{s[0-9]+}}
+ %tmp1 = load i32* %A
+ %tmp2 = load i32* %B
+ %tmp3 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %tmp1, i32 %tmp2)
+ ret i32 %tmp3
+}
+
+declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqdmulh.i32(i32, i32) nounwind readnone
+
+define <4 x i16> @sqrdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqrdmulh_4h:
+;CHECK: sqrdmulh.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sqrdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqrdmulh_8h:
+;CHECK: sqrdmulh.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @sqrdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqrdmulh_2s:
+;CHECK: sqrdmulh.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sqrdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqrdmulh_4s:
+;CHECK: sqrdmulh.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define i32 @sqrdmulh_1s(i32* %A, i32* %B) nounwind {
+;CHECK-LABEL: sqrdmulh_1s:
+;CHECK: sqrdmulh s0, {{s[0-9]+}}, {{s[0-9]+}}
+ %tmp1 = load i32* %A
+ %tmp2 = load i32* %B
+ %tmp3 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %tmp1, i32 %tmp2)
+ ret i32 %tmp3
+}
+
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32) nounwind readnone
+
+define <2 x float> @fmulx_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fmulx_2s:
+;CHECK: fmulx.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @fmulx_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fmulx_4s:
+;CHECK: fmulx.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @fmulx_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fmulx_2d:
+;CHECK: fmulx.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <4 x i32> @smlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: smlal4s:
+;CHECK: smlal.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @smlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: smlal2d:
+;CHECK: smlal.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp5 = add <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <4 x i32> @smlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: smlsl4s:
+;CHECK: smlsl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp5 = sub <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @smlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: smlsl2d:
+;CHECK: smlsl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp5 = sub <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
+declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
+
+define <4 x i32> @sqdmlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: sqdmlal4s:
+;CHECK: sqdmlal.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @sqdmlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: sqdmlal2d:
+;CHECK: sqdmlal.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
+ ret <2 x i64> %tmp5
+}
+
+define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: sqdmlal2_4s:
+;CHECK: sqdmlal2.4s
+ %load1 = load <8 x i16>* %A
+ %load2 = load <8 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @sqdmlal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: sqdmlal2_2d:
+;CHECK: sqdmlal2.2d
+ %load1 = load <4 x i32>* %A
+ %load2 = load <4 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
+ ret <2 x i64> %tmp5
+}
+
+define <4 x i32> @sqdmlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: sqdmlsl4s:
+;CHECK: sqdmlsl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @sqdmlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: sqdmlsl2d:
+;CHECK: sqdmlsl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
+ ret <2 x i64> %tmp5
+}
+
+define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: sqdmlsl2_4s:
+;CHECK: sqdmlsl2.4s
+ %load1 = load <8 x i16>* %A
+ %load2 = load <8 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @sqdmlsl2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: sqdmlsl2_2d:
+;CHECK: sqdmlsl2.2d
+ %load1 = load <4 x i32>* %A
+ %load2 = load <4 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
+ ret <2 x i64> %tmp5
+}
+
+define <4 x i32> @umlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: umlal4s:
+;CHECK: umlal.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @umlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: umlal2d:
+;CHECK: umlal.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp5 = add <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <4 x i32> @umlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: umlsl4s:
+;CHECK: umlsl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp5 = sub <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @umlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: umlsl2d:
+;CHECK: umlsl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp5 = sub <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <2 x float> @fmla_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
+;CHECK-LABEL: fmla_2s:
+;CHECK: fmla.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = load <2 x float>* %C
+ %tmp4 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp1, <2 x float> %tmp2, <2 x float> %tmp3)
+ ret <2 x float> %tmp4
+}
+
+define <4 x float> @fmla_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
+;CHECK-LABEL: fmla_4s:
+;CHECK: fmla.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = load <4 x float>* %C
+ %tmp4 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp1, <4 x float> %tmp2, <4 x float> %tmp3)
+ ret <4 x float> %tmp4
+}
+
+define <2 x double> @fmla_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
+;CHECK-LABEL: fmla_2d:
+;CHECK: fmla.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = load <2 x double>* %C
+ %tmp4 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp1, <2 x double> %tmp2, <2 x double> %tmp3)
+ ret <2 x double> %tmp4
+}
+
+declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+define <2 x float> @fmls_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
+;CHECK-LABEL: fmls_2s:
+;CHECK: fmls.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = load <2 x float>* %C
+ %tmp4 = fsub <2 x float> <float -0.0, float -0.0>, %tmp2
+ %tmp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp1, <2 x float> %tmp4, <2 x float> %tmp3)
+ ret <2 x float> %tmp5
+}
+
+define <4 x float> @fmls_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
+;CHECK-LABEL: fmls_4s:
+;CHECK: fmls.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = load <4 x float>* %C
+ %tmp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %tmp2
+ %tmp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp1, <4 x float> %tmp4, <4 x float> %tmp3)
+ ret <4 x float> %tmp5
+}
+
+define <2 x double> @fmls_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
+;CHECK-LABEL: fmls_2d:
+;CHECK: fmls.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = load <2 x double>* %C
+ %tmp4 = fsub <2 x double> <double -0.0, double -0.0>, %tmp2
+ %tmp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp1, <2 x double> %tmp4, <2 x double> %tmp3)
+ ret <2 x double> %tmp5
+}
+
+define <2 x float> @fmls_commuted_neg_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
+;CHECK-LABEL: fmls_commuted_neg_2s:
+;CHECK: fmls.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = load <2 x float>* %C
+ %tmp4 = fsub <2 x float> <float -0.0, float -0.0>, %tmp2
+ %tmp5 = call <2 x float> @llvm.fma.v2f32(<2 x float> %tmp4, <2 x float> %tmp1, <2 x float> %tmp3)
+ ret <2 x float> %tmp5
+}
+
+define <4 x float> @fmls_commuted_neg_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
+;CHECK-LABEL: fmls_commuted_neg_4s:
+;CHECK: fmls.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = load <4 x float>* %C
+ %tmp4 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %tmp2
+ %tmp5 = call <4 x float> @llvm.fma.v4f32(<4 x float> %tmp4, <4 x float> %tmp1, <4 x float> %tmp3)
+ ret <4 x float> %tmp5
+}
+
+define <2 x double> @fmls_commuted_neg_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind {
+;CHECK-LABEL: fmls_commuted_neg_2d:
+;CHECK: fmls.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = load <2 x double>* %C
+ %tmp4 = fsub <2 x double> <double -0.0, double -0.0>, %tmp2
+ %tmp5 = call <2 x double> @llvm.fma.v2f64(<2 x double> %tmp4, <2 x double> %tmp1, <2 x double> %tmp3)
+ ret <2 x double> %tmp5
+}
+
+define <2 x float> @fmls_indexed_2s(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp {
+;CHECK-LABEL: fmls_indexed_2s:
+;CHECK: fmls.2s
+entry:
+ %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %c
+ %lane = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer
+ %fmls1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %lane, <2 x float> %a)
+ ret <2 x float> %fmls1
+}
+
+define <4 x float> @fmls_indexed_4s(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp {
+;CHECK-LABEL: fmls_indexed_4s:
+;CHECK: fmls.4s
+entry:
+ %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+ %lane = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
+ %fmls1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %lane, <4 x float> %a)
+ ret <4 x float> %fmls1
+}
+
+define <2 x double> @fmls_indexed_2d(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp {
+;CHECK-LABEL: fmls_indexed_2d:
+;CHECK: fmls.2d
+entry:
+ %0 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
+ %lane = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer
+ %fmls1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %0, <2 x double> %lane, <2 x double> %a)
+ ret <2 x double> %fmls1
+}
+
+define <2 x float> @fmla_indexed_scalar_2s(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: fmla_indexed_scalar_2s:
+; CHECK-NEXT: fmla.2s
+; CHECK-NEXT: ret
+ %v1 = insertelement <2 x float> undef, float %c, i32 0
+ %v2 = insertelement <2 x float> %v1, float %c, i32 1
+ %fmla1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %v1, <2 x float> %b, <2 x float> %a) nounwind
+ ret <2 x float> %fmla1
+}
+
+define <4 x float> @fmla_indexed_scalar_4s(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: fmla_indexed_scalar_4s:
+; CHECK-NEXT: fmla.4s
+; CHECK-NEXT: ret
+ %v1 = insertelement <4 x float> undef, float %c, i32 0
+ %v2 = insertelement <4 x float> %v1, float %c, i32 1
+ %v3 = insertelement <4 x float> %v2, float %c, i32 2
+ %v4 = insertelement <4 x float> %v3, float %c, i32 3
+ %fmla1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %v4, <4 x float> %b, <4 x float> %a) nounwind
+ ret <4 x float> %fmla1
+}
+
+define <2 x double> @fmla_indexed_scalar_2d(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp {
+; CHECK-LABEL: fmla_indexed_scalar_2d:
+; CHECK-NEXT: fmla.2d
+; CHECK-NEXT: ret
+entry:
+ %v1 = insertelement <2 x double> undef, double %c, i32 0
+ %v2 = insertelement <2 x double> %v1, double %c, i32 1
+ %fmla1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %v2, <2 x double> %b, <2 x double> %a) nounwind
+ ret <2 x double> %fmla1
+}
+
+define <4 x i16> @mul_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: mul_4h:
+;CHECK-NOT: dup
+;CHECK: mul.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = mul <4 x i16> %tmp1, %tmp3
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @mul_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: mul_8h:
+;CHECK-NOT: dup
+;CHECK: mul.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = mul <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @mul_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: mul_2s:
+;CHECK-NOT: dup
+;CHECK: mul.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp4 = mul <2 x i32> %tmp1, %tmp3
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @mul_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: mul_4s:
+;CHECK-NOT: dup
+;CHECK: mul.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = mul <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind {
+; CHECK-LABEL: mul_2d:
+; CHECK: mul
+; CHECK: mul
+ %tmp1 = mul <2 x i64> %A, %B
+ ret <2 x i64> %tmp1
+}
+
+define <2 x float> @fmul_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fmul_lane_2s:
+;CHECK-NOT: dup
+;CHECK: fmul.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp4 = fmul <2 x float> %tmp1, %tmp3
+ ret <2 x float> %tmp4
+}
+
+define <4 x float> @fmul_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fmul_lane_4s:
+;CHECK-NOT: dup
+;CHECK: fmul.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = fmul <4 x float> %tmp1, %tmp3
+ ret <4 x float> %tmp4
+}
+
+define <2 x double> @fmul_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fmul_lane_2d:
+;CHECK-NOT: dup
+;CHECK: fmul.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp4 = fmul <2 x double> %tmp1, %tmp3
+ ret <2 x double> %tmp4
+}
+
+define float @fmul_lane_s(float %A, <4 x float> %vec) nounwind {
+;CHECK-LABEL: fmul_lane_s:
+;CHECK-NOT: dup
+;CHECK: fmul.s s0, s0, v1[3]
+ %B = extractelement <4 x float> %vec, i32 3
+ %res = fmul float %A, %B
+ ret float %res
+}
+
+define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind {
+;CHECK-LABEL: fmul_lane_d:
+;CHECK-NOT: dup
+;CHECK: fmul.d d0, d0, v1[1]
+ %B = extractelement <2 x double> %vec, i32 1
+ %res = fmul double %A, %B
+ ret double %res
+}
+
+
+
+define <2 x float> @fmulx_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fmulx_lane_2s:
+;CHECK-NOT: dup
+;CHECK: fmulx.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp4 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp3)
+ ret <2 x float> %tmp4
+}
+
+define <4 x float> @fmulx_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fmulx_lane_4s:
+;CHECK-NOT: dup
+;CHECK: fmulx.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp3)
+ ret <4 x float> %tmp4
+}
+
+define <2 x double> @fmulx_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fmulx_lane_2d:
+;CHECK-NOT: dup
+;CHECK: fmulx.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp4 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp3)
+ ret <2 x double> %tmp4
+}
+
+define <4 x i16> @sqdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqdmulh_lane_4h:
+;CHECK-NOT: dup
+;CHECK: sqdmulh.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @sqdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqdmulh_lane_8h:
+;CHECK-NOT: dup
+;CHECK: sqdmulh.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @sqdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqdmulh_lane_2s:
+;CHECK-NOT: dup
+;CHECK: sqdmulh.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @sqdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqdmulh_lane_4s:
+;CHECK-NOT: dup
+;CHECK: sqdmulh.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
+;CHECK-LABEL: sqdmulh_lane_1s:
+;CHECK-NOT: dup
+;CHECK: sqdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1]
+ %tmp1 = extractelement <4 x i32> %B, i32 1
+ %tmp2 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %A, i32 %tmp1)
+ ret i32 %tmp2
+}
+
+define <4 x i16> @sqrdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqrdmulh_lane_4h:
+;CHECK-NOT: dup
+;CHECK: sqrdmulh.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @sqrdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqrdmulh_lane_8h:
+;CHECK-NOT: dup
+;CHECK: sqrdmulh.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @sqrdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqrdmulh_lane_2s:
+;CHECK-NOT: dup
+;CHECK: sqrdmulh.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @sqrdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqrdmulh_lane_4s:
+;CHECK-NOT: dup
+;CHECK: sqrdmulh.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind {
+;CHECK-LABEL: sqrdmulh_lane_1s:
+;CHECK-NOT: dup
+;CHECK: sqrdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1]
+ %tmp1 = extractelement <4 x i32> %B, i32 1
+ %tmp2 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %A, i32 %tmp1)
+ ret i32 %tmp2
+}
+
+define <4 x i32> @sqdmull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqdmull_lane_4s:
+;CHECK-NOT: dup
+;CHECK: sqdmull.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @sqdmull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqdmull_lane_2d:
+;CHECK-NOT: dup
+;CHECK: sqdmull.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqdmull2_lane_4s:
+;CHECK-NOT: dup
+;CHECK: sqdmull2.4s
+ %load1 = load <8 x i16>* %A
+ %load2 = load <8 x i16>* %B
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @sqdmull2_lane_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqdmull2_lane_2d:
+;CHECK-NOT: dup
+;CHECK: sqdmull2.2d
+ %load1 = load <4 x i32>* %A
+ %load2 = load <4 x i32>* %B
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i64> %tmp4
+}
+
+define <4 x i32> @umull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: umull_lane_4s:
+;CHECK-NOT: dup
+;CHECK: umull.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @umull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: umull_lane_2d:
+;CHECK-NOT: dup
+;CHECK: umull.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+define <4 x i32> @smull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: smull_lane_4s:
+;CHECK-NOT: dup
+;CHECK: smull.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @smull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: smull_lane_2d:
+;CHECK-NOT: dup
+;CHECK: smull.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
+ ret <2 x i64> %tmp4
+}
+
+define <4 x i32> @smlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: smlal_lane_4s:
+;CHECK-NOT: dup
+;CHECK: smlal.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+ %tmp6 = add <4 x i32> %tmp3, %tmp5
+ ret <4 x i32> %tmp6
+}
+
+define <2 x i64> @smlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: smlal_lane_2d:
+;CHECK-NOT: dup
+;CHECK: smlal.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+ %tmp6 = add <2 x i64> %tmp3, %tmp5
+ ret <2 x i64> %tmp6
+}
+
+define <4 x i32> @sqdmlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: sqdmlal_lane_4s:
+;CHECK-NOT: dup
+;CHECK: sqdmlal.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+ %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
+ ret <4 x i32> %tmp6
+}
+
+define <2 x i64> @sqdmlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: sqdmlal_lane_2d:
+;CHECK-NOT: dup
+;CHECK: sqdmlal.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+ %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
+ ret <2 x i64> %tmp6
+}
+
+define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: sqdmlal2_lane_4s:
+;CHECK-NOT: dup
+;CHECK: sqdmlal2.4s
+ %load1 = load <8 x i16>* %A
+ %load2 = load <8 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
+ ret <4 x i32> %tmp6
+}
+
+define <2 x i64> @sqdmlal2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: sqdmlal2_lane_2d:
+;CHECK-NOT: dup
+;CHECK: sqdmlal2.2d
+ %load1 = load <4 x i32>* %A
+ %load2 = load <4 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
+ ret <2 x i64> %tmp6
+}
+
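+; Scalar saturating multiply-accumulate: the i32 variants are expected to go
+; through lane 0 of the 4s sqdmlal/sqdmlsl, while the i64 variants use the
+; scalar sqdmlal.s/sqdmlsl.s forms, as checked below.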
+define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
+;CHECK-LABEL: sqdmlal_lane_1s:
+;CHECK: sqdmlal.4s
+ %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
+ %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
+ %prod = extractelement <4 x i32> %prod.vec, i32 0
+ %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
+ ret i32 %res
+}
+declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
+
+define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
+;CHECK-LABEL: sqdmlsl_lane_1s:
+;CHECK: sqdmlsl.4s
+ %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
+ %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
+ %prod = extractelement <4 x i32> %prod.vec, i32 0
+ %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
+ ret i32 %res
+}
+declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
+
+define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
+;CHECK-LABEL: sqdmlal_lane_1d:
+;CHECK: sqdmlal.s
+ %rhs = extractelement <2 x i32> %C, i32 1
+ %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
+ %res = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %A, i64 %prod)
+ ret i64 %res
+}
+declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32)
+declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64)
+
+define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
+;CHECK-LABEL: sqdmlsl_lane_1d:
+;CHECK: sqdmlsl.s
+ %rhs = extractelement <2 x i32> %C, i32 1
+ %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
+ %res = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %A, i64 %prod)
+ ret i64 %res
+}
+declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64)
+
+
+define <4 x i32> @umlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: umlal_lane_4s:
+;CHECK-NOT: dup
+;CHECK: umlal.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+ %tmp6 = add <4 x i32> %tmp3, %tmp5
+ ret <4 x i32> %tmp6
+}
+
+define <2 x i64> @umlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: umlal_lane_2d:
+;CHECK-NOT: dup
+;CHECK: umlal.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+ %tmp6 = add <2 x i64> %tmp3, %tmp5
+ ret <2 x i64> %tmp6
+}
+
+
+define <4 x i32> @smlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: smlsl_lane_4s:
+;CHECK-NOT: dup
+;CHECK: smlsl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+ %tmp6 = sub <4 x i32> %tmp3, %tmp5
+ ret <4 x i32> %tmp6
+}
+
+define <2 x i64> @smlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: smlsl_lane_2d:
+;CHECK-NOT: dup
+;CHECK: smlsl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+ %tmp6 = sub <2 x i64> %tmp3, %tmp5
+ ret <2 x i64> %tmp6
+}
+
+define <4 x i32> @sqdmlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: sqdmlsl_lane_4s:
+;CHECK-NOT: dup
+;CHECK: sqdmlsl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+ %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
+ ret <4 x i32> %tmp6
+}
+
+define <2 x i64> @sqdmlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: sqdmlsl_lane_2d:
+;CHECK-NOT: dup
+;CHECK: sqdmlsl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+ %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
+ ret <2 x i64> %tmp6
+}
+
+define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: sqdmlsl2_lane_4s:
+;CHECK-NOT: dup
+;CHECK: sqdmlsl2.4s
+ %load1 = load <8 x i16>* %A
+ %load2 = load <8 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
+ ret <4 x i32> %tmp6
+}
+
+define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: sqdmlsl2_lane_2d:
+;CHECK-NOT: dup
+;CHECK: sqdmlsl2.2d
+ %load1 = load <4 x i32>* %A
+ %load2 = load <4 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
+ ret <2 x i64> %tmp6
+}
+
+define <4 x i32> @umlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
+;CHECK-LABEL: umlsl_lane_4s:
+;CHECK-NOT: dup
+;CHECK: umlsl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i32>* %C
+ %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+ %tmp6 = sub <4 x i32> %tmp3, %tmp5
+ ret <4 x i32> %tmp6
+}
+
+define <2 x i64> @umlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind {
+;CHECK-LABEL: umlsl_lane_2d:
+;CHECK-NOT: dup
+;CHECK: umlsl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i64>* %C
+ %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
+ %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+ %tmp6 = sub <2 x i64> %tmp3, %tmp5
+ ret <2 x i64> %tmp6
+}
+
+; Scalar FMULX
+define float @fmulxs(float %a, float %b) nounwind {
+; CHECK-LABEL: fmulxs:
+; CHECK-NEXT: fmulx s0, s0, s1
+ %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
+; CHECK-NEXT: ret
+ ret float %fmulx.i
+}
+
+define double @fmulxd(double %a, double %b) nounwind {
+; CHECK-LABEL: fmulxd:
+; CHECK-NEXT: fmulx d0, d0, d1
+ %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
+; CHECK-NEXT: ret
+ ret double %fmulx.i
+}
+
+define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind {
+; CHECK-LABEL: fmulxs_lane:
+; CHECK-NEXT: fmulx.s s0, s0, v1[3]
+ %b = extractelement <4 x float> %vec, i32 3
+ %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
+; CHECK-NEXT: ret
+ ret float %fmulx.i
+}
+
+define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind {
+; CHECK-LABEL: fmulxd_lane:
+; CHECK-NEXT: fmulx d0, d0, v1[1]
+ %b = extractelement <2 x double> %vec, i32 1
+ %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
+; CHECK-NEXT: ret
+ ret double %fmulx.i
+}
+
+declare double @llvm.aarch64.neon.fmulx.f64(double, double) nounwind readnone
+declare float @llvm.aarch64.neon.fmulx.f32(float, float) nounwind readnone
+
+
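+; High-half long multiplies: when both operands are the top 64 bits of a
+; 128-bit vector (extracted via shufflevector/bitcast), the smull2/umull2
+; forms should be selected directly, including the lane-indexed variants
+; (see foo6-foo9 below).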
+define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind {
+; CHECK-LABEL: smull2_8h_simple:
+; CHECK-NEXT: smull2.8h v0, v0, v1
+; CHECK-NEXT: ret
+ %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %3 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %1, <8 x i8> %2) #2
+ ret <8 x i16> %3
+}
+
+define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind {
+; CHECK-LABEL: foo0:
+; CHECK: smull2.8h v0, v0, v1
+ %tmp = bitcast <16 x i8> %a to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
+ %tmp2 = bitcast <16 x i8> %b to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
+ %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
+ ret <8 x i16> %vmull.i.i
+}
+
+define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind {
+; CHECK-LABEL: foo1:
+; CHECK: smull2.4s v0, v0, v1
+ %tmp = bitcast <8 x i16> %a to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+ %tmp2 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+ ret <4 x i32> %vmull2.i.i
+}
+
+define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind {
+; CHECK-LABEL: foo2:
+; CHECK: smull2.2d v0, v0, v1
+ %tmp = bitcast <4 x i32> %a to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
+ %tmp2 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+ ret <2 x i64> %vmull2.i.i
+}
+
+define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind {
+; CHECK-LABEL: foo3:
+; CHECK: umull2.8h v0, v0, v1
+ %tmp = bitcast <16 x i8> %a to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8>
+ %tmp2 = bitcast <16 x i8> %b to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
+ %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
+ ret <8 x i16> %vmull.i.i
+}
+
+define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind {
+; CHECK-LABEL: foo4:
+; CHECK: umull2.4s v0, v0, v1
+ %tmp = bitcast <8 x i16> %a to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+ %tmp2 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+ ret <4 x i32> %vmull2.i.i
+}
+
+define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind {
+; CHECK-LABEL: foo5:
+; CHECK: umull2.2d v0, v0, v1
+ %tmp = bitcast <4 x i32> %a to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
+ %tmp2 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+ ret <2 x i64> %vmull2.i.i
+}
+
+define <4 x i32> @foo6(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo6:
+; CHECK-NEXT: smull2.4s v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+ %0 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
+ %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo7:
+; CHECK-NEXT: smull2.2d v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+ %0 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
+ %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
+ ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo8:
+; CHECK-NEXT: umull2.4s v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+ %0 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
+ %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: foo9:
+; CHECK-NEXT: umull2.2d v0, v1, v2[1]
+; CHECK-NEXT: ret
+entry:
+ %0 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
+ %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
+ ret <2 x i64> %vmull2.i
+}
+
+define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
+; CHECK-LABEL: bar0:
+; CHECK: smlal2.8h v0, v1, v2
+; CHECK-NEXT: ret
+
+ %tmp = bitcast <16 x i8> %b to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
+ %tmp2 = bitcast <16 x i8> %c to <2 x i64>
+ %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
+ %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
+ %add.i = add <8 x i16> %vmull.i.i.i, %a
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
+; CHECK-LABEL: bar1:
+; CHECK: smlal2.4s v0, v1, v2
+; CHECK-NEXT: ret
+
+ %tmp = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
+ %tmp2 = bitcast <8 x i16> %c to <2 x i64>
+ %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+ %add.i = add <4 x i32> %vmull2.i.i.i, %a
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
+; CHECK-LABEL: bar2:
+; CHECK: smlal2.2d v0, v1, v2
+; CHECK-NEXT: ret
+
+ %tmp = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
+ %tmp2 = bitcast <4 x i32> %c to <2 x i64>
+ %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+ %add.i = add <2 x i64> %vmull2.i.i.i, %a
+ ret <2 x i64> %add.i
+}
+
+define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind {
+; CHECK-LABEL: bar3:
+; CHECK: umlal2.8h v0, v1, v2
+; CHECK-NEXT: ret
+
+ %tmp = bitcast <16 x i8> %b to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8>
+ %tmp2 = bitcast <16 x i8> %c to <2 x i64>
+ %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
+ %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
+ %add.i = add <8 x i16> %vmull.i.i.i, %a
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind {
+; CHECK-LABEL: bar4:
+; CHECK: umlal2.4s v0, v1, v2
+; CHECK-NEXT: ret
+
+ %tmp = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16>
+ %tmp2 = bitcast <8 x i16> %c to <2 x i64>
+ %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+ %add.i = add <4 x i32> %vmull2.i.i.i, %a
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind {
+; CHECK-LABEL: bar5:
+; CHECK: umlal2.2d v0, v1, v2
+; CHECK-NEXT: ret
+
+ %tmp = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32>
+ %tmp2 = bitcast <4 x i32> %c to <2 x i64>
+ %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+ %add.i = add <2 x i64> %vmull2.i.i.i, %a
+ ret <2 x i64> %add.i
+}
+
+define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
+; CHECK-LABEL: mlal2_1:
+; CHECK: smlal2.4s v0, v1, v2[3]
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %tmp = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+ %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+ %add = add <4 x i32> %vmull2.i.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
+; CHECK-LABEL: mlal2_2:
+; CHECK: smlal2.2d v0, v1, v2[1]
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %tmp = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
+ %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+ %add = add <2 x i64> %vmull2.i.i, %a
+ ret <2 x i64> %add
+}
+
+define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind {
+; CHECK-LABEL: mlal2_4:
+; CHECK: umlal2.4s v0, v1, v2[2]
+; CHECK-NEXT: ret
+
+ %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ %tmp = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+ %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+ %add = add <4 x i32> %vmull2.i.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @mlal2_5(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind {
+; CHECK-LABEL: mlal2_5:
+; CHECK: umlal2.2d v0, v1, v2[0]
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> zeroinitializer
+ %tmp = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
+ %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
+ %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+ %add = add <2 x i64> %vmull2.i.i, %a
+ ret <2 x i64> %add
+}
+
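+; Multiplying a vector by a scalar splat should fold into the lane-indexed
+; fmul (e.g. fmul.2d v0, v0, v1[0]) instead of materializing the splat with dup.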
+; rdar://12328502
+define <2 x double> @vmulq_n_f64(<2 x double> %x, double %y) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: vmulq_n_f64:
+; CHECK-NOT: dup.2d
+; CHECK: fmul.2d v0, v0, v1[0]
+ %vecinit.i = insertelement <2 x double> undef, double %y, i32 0
+ %vecinit1.i = insertelement <2 x double> %vecinit.i, double %y, i32 1
+ %mul.i = fmul <2 x double> %vecinit1.i, %x
+ ret <2 x double> %mul.i
+}
+
+define <4 x float> @vmulq_n_f32(<4 x float> %x, float %y) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: vmulq_n_f32:
+; CHECK-NOT: dup.4s
+; CHECK: fmul.4s v0, v0, v1[0]
+ %vecinit.i = insertelement <4 x float> undef, float %y, i32 0
+ %vecinit1.i = insertelement <4 x float> %vecinit.i, float %y, i32 1
+ %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %y, i32 2
+ %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %y, i32 3
+ %mul.i = fmul <4 x float> %vecinit3.i, %x
+ ret <4 x float> %mul.i
+}
+
+define <2 x float> @vmul_n_f32(<2 x float> %x, float %y) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: vmul_n_f32:
+; CHECK-NOT: dup.2s
+; CHECK: fmul.2s v0, v0, v1[0]
+ %vecinit.i = insertelement <2 x float> undef, float %y, i32 0
+ %vecinit1.i = insertelement <2 x float> %vecinit.i, float %y, i32 1
+ %mul.i = fmul <2 x float> %vecinit1.i, %x
+ ret <2 x float> %mul.i
+}
+
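+; mla/mul by a single lane of a 128-bit vector: the lane should be referenced
+; directly (e.g. mla.4h v0, v1, v2[6]) rather than extracted with ext first.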
+define <4 x i16> @vmla_laneq_s16_test(<4 x i16> %a, <4 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: vmla_laneq_s16_test:
+; CHECK-NOT: ext
+; CHECK: mla.4h v0, v1, v2[6]
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
+ %mul = mul <4 x i16> %shuffle, %b
+ %add = add <4 x i16> %mul, %a
+ ret <4 x i16> %add
+}
+
+define <2 x i32> @vmla_laneq_s32_test(<2 x i32> %a, <2 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: vmla_laneq_s32_test:
+; CHECK-NOT: ext
+; CHECK: mla.2s v0, v1, v2[3]
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+ %mul = mul <2 x i32> %shuffle, %b
+ %add = add <2 x i32> %mul, %a
+ ret <2 x i32> %add
+}
+
+define <8 x i16> @not_really_vmlaq_laneq_s16_test(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: not_really_vmlaq_laneq_s16_test:
+; CHECK-NOT: ext
+; CHECK: mla.8h v0, v1, v2[5]
+; CHECK-NEXT: ret
+ %shuffle1 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle2 = shufflevector <4 x i16> %shuffle1, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %mul = mul <8 x i16> %shuffle2, %b
+ %add = add <8 x i16> %mul, %a
+ ret <8 x i16> %add
+}
+
+define <4 x i32> @not_really_vmlaq_laneq_s32_test(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: not_really_vmlaq_laneq_s32_test:
+; CHECK-NOT: ext
+; CHECK: mla.4s v0, v1, v2[3]
+; CHECK-NEXT: ret
+ %shuffle1 = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle2 = shufflevector <2 x i32> %shuffle1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %mul = mul <4 x i32> %shuffle2, %b
+ %add = add <4 x i32> %mul, %a
+ ret <4 x i32> %add
+}
+
+define <4 x i32> @vmull_laneq_s16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: vmull_laneq_s16_test:
+; CHECK-NOT: ext
+; CHECK: smull.4s v0, v0, v1[6]
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @vmull_laneq_s32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: vmull_laneq_s32_test:
+; CHECK-NOT: ext
+; CHECK: smull.2d v0, v0, v1[2]
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
+ ret <2 x i64> %vmull2.i
+}
+define <4 x i32> @vmull_laneq_u16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: vmull_laneq_u16_test:
+; CHECK-NOT: ext
+; CHECK: umull.4s v0, v0, v1[6]
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
+ %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @vmull_laneq_u32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: vmull_laneq_u32_test:
+; CHECK-NOT: ext
+; CHECK: umull.2d v0, v0, v1[2]
+; CHECK-NEXT: ret
+ %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
+ %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
+ ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: vmull_high_n_s16_test:
+; CHECK-NOT: ext
+; CHECK: smull2.4s
+; CHECK-NEXT: ret
+ %conv = trunc i32 %d to i16
+ %0 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+ %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
+ %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
+ %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
+ ret <4 x i32> %vmull2.i.i
+}
+
+define <2 x i64> @vmull_high_n_s32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: vmull_high_n_s32_test:
+; CHECK-NOT: ext
+; CHECK: smull2.2d
+; CHECK-NEXT: ret
+ %0 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
+ %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
+ %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
+ ret <2 x i64> %vmull2.i.i
+}
+
+define <4 x i32> @vmull_high_n_u16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: vmull_high_n_u16_test:
+; CHECK-NOT: ext
+; CHECK: umull2.4s
+; CHECK-NEXT: ret
+ %conv = trunc i32 %d to i16
+ %0 = bitcast <8 x i16> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16>
+ %vecinit.i = insertelement <4 x i16> undef, i16 %conv, i32 0
+ %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
+ %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
+ %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
+ ret <4 x i32> %vmull2.i.i
+}
+
+define <2 x i64> @vmull_high_n_u32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp {
+entry:
+; CHECK-LABEL: vmull_high_n_u32_test:
+; CHECK-NOT: ext
+; CHECK: umull2.2d
+; CHECK-NEXT: ret
+ %0 = bitcast <4 x i32> %b to <2 x i64>
+ %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
+ %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
+ %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
+ %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
+ %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
+ ret <2 x i64> %vmull2.i.i
+}
+
+define <4 x i32> @vmul_built_dup_test(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: vmul_built_dup_test:
+; CHECK-NOT: ins
+; CHECK-NOT: dup
+; CHECK: mul.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[1]
+ %vget_lane = extractelement <4 x i32> %b, i32 1
+ %vecinit.i = insertelement <4 x i32> undef, i32 %vget_lane, i32 0
+ %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %vget_lane, i32 1
+ %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %vget_lane, i32 2
+ %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %vget_lane, i32 3
+ %prod = mul <4 x i32> %a, %vecinit3.i
+ ret <4 x i32> %prod
+}
+
+define <4 x i16> @vmul_built_dup_fromsmall_test(<4 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: vmul_built_dup_fromsmall_test:
+; CHECK-NOT: ins
+; CHECK-NOT: dup
+; CHECK: mul.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[3]
+ %vget_lane = extractelement <4 x i16> %b, i32 3
+ %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0
+ %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1
+ %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %vget_lane, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %vget_lane, i32 3
+ %prod = mul <4 x i16> %a, %vecinit3.i
+ ret <4 x i16> %prod
+}
+
+define <8 x i16> @vmulq_built_dup_fromsmall_test(<8 x i16> %a, <4 x i16> %b) {
+; CHECK-LABEL: vmulq_built_dup_fromsmall_test:
+; CHECK-NOT: ins
+; CHECK-NOT: dup
+; CHECK: mul.8h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+ %vget_lane = extractelement <4 x i16> %b, i32 0
+ %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0
+ %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1
+ %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %vget_lane, i32 2
+ %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %vget_lane, i32 3
+ %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %vget_lane, i32 4
+ %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %vget_lane, i32 5
+ %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %vget_lane, i32 6
+ %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %vget_lane, i32 7
+ %prod = mul <8 x i16> %a, %vecinit7.i
+ ret <8 x i16> %prod
+}
+
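+; Long multiplies whose operands are both high-half extracts should select the
+; 2-variant instructions (sqdmull2, pmull2, umlal2, ...) with no ext.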
+define <2 x i64> @mull_from_two_extracts(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: mull_from_two_extracts:
+; CHECK-NOT: ext
+; CHECK: sqdmull2.2d
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: mlal_from_two_extracts:
+; CHECK-NOT: ext
+; CHECK: sqdmlal2.2d
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
+ ret <2 x i64> %sum
+}
+
+define <2 x i64> @mull_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
+; CHECK-LABEL: mull_from_extract_dup:
+; CHECK-NOT: ext
+; CHECK: sqdmull2.2d
+ %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0
+ %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+ ret <2 x i64> %res
+}
+
+define <8 x i16> @pmull_from_extract_dup(<16 x i8> %lhs, i8 %rhs) {
+; CHECK-LABEL: pmull_from_extract_dup:
+; CHECK-NOT: ext
+; CHECK: pmull2.8h
+ %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0
+ %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+
+ %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+ %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @pmull_from_extract_duplane(<16 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK-LABEL: pmull_from_extract_duplane:
+; CHECK-NOT: ext
+; CHECK: pmull2.8h
+
+ %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+
+ %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
+ ret <8 x i16> %res
+}
+
+define <2 x i64> @sqdmull_from_extract_duplane(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: sqdmull_from_extract_duplane:
+; CHECK-NOT: ext
+; CHECK: sqdmull2.2d
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @sqdmlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: sqdmlal_from_extract_duplane:
+; CHECK-NOT: ext
+; CHECK: sqdmlal2.2d
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
+ ret <2 x i64> %sum
+}
+
+define <2 x i64> @umlal_from_extract_duplane(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK-LABEL: umlal_from_extract_duplane:
+; CHECK-NOT: ext
+; CHECK: umlal2.2d
+
+ %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
+
+ %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+ %sum = add <2 x i64> %accum, %res
+ ret <2 x i64> %sum
+}
+
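+; Scalar fma/fms with one operand taken from a vector lane should use the
+; indexed forms (fmla.s/fmls.s and fmla.d/fmls.d).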
+define float @scalar_fmla_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
+; CHECK-LABEL: scalar_fmla_from_extract_v4f32:
+; CHECK: fmla.s s0, s1, v2[3]
+ %rhs = extractelement <4 x float> %rvec, i32 3
+ %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
+ ret float %res
+}
+
+define float @scalar_fmla_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
+; CHECK-LABEL: scalar_fmla_from_extract_v2f32:
+; CHECK: fmla.s s0, s1, v2[1]
+ %rhs = extractelement <2 x float> %rvec, i32 1
+ %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
+ ret float %res
+}
+
+define float @scalar_fmls_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) {
+; CHECK-LABEL: scalar_fmls_from_extract_v4f32:
+; CHECK: fmls.s s0, s1, v2[3]
+ %rhs.scal = extractelement <4 x float> %rvec, i32 3
+ %rhs = fsub float -0.0, %rhs.scal
+ %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
+ ret float %res
+}
+
+define float @scalar_fmls_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) {
+; CHECK-LABEL: scalar_fmls_from_extract_v2f32:
+; CHECK: fmls.s s0, s1, v2[1]
+ %rhs.scal = extractelement <2 x float> %rvec, i32 1
+ %rhs = fsub float -0.0, %rhs.scal
+ %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum)
+ ret float %res
+}
+
+declare float @llvm.fma.f32(float, float, float)
+
+define double @scalar_fmla_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
+; CHECK-LABEL: scalar_fmla_from_extract_v2f64:
+; CHECK: fmla.d d0, d1, v2[1]
+ %rhs = extractelement <2 x double> %rvec, i32 1
+ %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum)
+ ret double %res
+}
+
+define double @scalar_fmls_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) {
+; CHECK-LABEL: scalar_fmls_from_extract_v2f64:
+; CHECK: fmls.d d0, d1, v2[1]
+ %rhs.scal = extractelement <2 x double> %rvec, i32 1
+ %rhs = fsub double -0.0, %rhs.scal
+ %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum)
+ ret double %res
+}
+
+declare double @llvm.fma.f64(double, double, double)
+
+define <2 x float> @fmls_with_fneg_before_extract_v2f32(<2 x float> %accum, <2 x float> %lhs, <4 x float> %rhs) {
+; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32:
+; CHECK: fmls.2s v0, v1, v2[3]
+ %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
+ %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <2 x i32> <i32 3, i32 3>
+ %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum)
+ ret <2 x float> %res
+}
+
+define <2 x float> @fmls_with_fneg_before_extract_v2f32_1(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) {
+; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32_1:
+; CHECK: fmls.2s v0, v1, v2[1]
+ %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
+ %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+ %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum)
+ ret <2 x float> %res
+}
+
+define <4 x float> @fmls_with_fneg_before_extract_v4f32(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) {
+; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32:
+; CHECK: fmls.4s v0, v1, v2[3]
+ %rhs_neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %rhs
+ %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum)
+ ret <4 x float> %res
+}
+
+define <4 x float> @fmls_with_fneg_before_extract_v4f32_1(<4 x float> %accum, <4 x float> %lhs, <2 x float> %rhs) {
+; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32_1:
+; CHECK: fmls.4s v0, v1, v2[1]
+ %rhs_neg = fsub <2 x float> <float -0.0, float -0.0>, %rhs
+ %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum)
+ ret <4 x float> %res
+}
+
+define <2 x double> @fmls_with_fneg_before_extract_v2f64(<2 x double> %accum, <2 x double> %lhs, <2 x double> %rhs) {
+; CHECK-LABEL: fmls_with_fneg_before_extract_v2f64:
+; CHECK: fmls.2d v0, v1, v2[1]
+ %rhs_neg = fsub <2 x double> <double -0.0, double -0.0>, %rhs
+ %splat = shufflevector <2 x double> %rhs_neg, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+ %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %lhs, <2 x double> %splat, <2 x double> %accum)
+ ret <2 x double> %res
+}
+
+define <1 x double> @test_fmul_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
+; CHECK-LABEL: test_fmul_v1f64:
+; CHECK: fmul
+ %prod = fmul <1 x double> %L, %R
+ ret <1 x double> %prod
+}
+
+define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind {
+; CHECK-LABEL: test_fdiv_v1f64:
+; CHECK: fdiv
+ %prod = fdiv <1 x double> %L, %R
+ ret <1 x double> %prod
+}
+
+define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
+;CHECK-LABEL: sqdmlal_d:
+;CHECK: sqdmlal
+ %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
+ %tmp5 = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %C, i64 %tmp4)
+ ret i64 %tmp5
+}
+
+define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
+;CHECK-LABEL: sqdmlsl_d:
+;CHECK: sqdmlsl
+ %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
+ %tmp5 = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %tmp4)
+ ret i64 %tmp5
+}
+
+define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind {
+; CHECK-LABEL: test_pmull_64:
+; CHECK: pmull.1q
+ %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
+ ret <16 x i8> %val
+}
+
+define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind {
+; CHECK-LABEL: test_pmull_high_64:
+; CHECK: pmull2.1q
+ %l_hi = extractelement <2 x i64> %l, i32 1
+ %r_hi = extractelement <2 x i64> %r, i32 1
+ %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l_hi, i64 %r_hi)
+ ret <16 x i8> %val
+}
+
+declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
+
+define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind {
+; CHECK-LABEL: test_mul_v1i64:
+; CHECK: mul
+ %prod = mul <1 x i64> %lhs, %rhs
+ ret <1 x i64> %prod
+}
diff --git a/test/CodeGen/AArch64/arm64-volatile.ll b/test/CodeGen/AArch64/arm64-volatile.ll
new file mode 100644
index 0000000..e00ac5a
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-volatile.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
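+; Adjacent loads are normally merged into a single ldp; volatile loads must
+; stay as two separate ldr instructions.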
+define i64 @normal_load(i64* nocapture %bar) nounwind readonly {
+; CHECK: normal_load
+; CHECK: ldp
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
+ %add.ptr = getelementptr inbounds i64* %bar, i64 1
+ %tmp = load i64* %add.ptr, align 8
+ %add.ptr1 = getelementptr inbounds i64* %bar, i64 2
+ %tmp1 = load i64* %add.ptr1, align 8
+ %add = add nsw i64 %tmp1, %tmp
+ ret i64 %add
+}
+
+define i64 @volatile_load(i64* nocapture %bar) nounwind {
+; CHECK: volatile_load
+; CHECK: ldr
+; CHECK-NEXT: ldr
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
+ %add.ptr = getelementptr inbounds i64* %bar, i64 1
+ %tmp = load volatile i64* %add.ptr, align 8
+ %add.ptr1 = getelementptr inbounds i64* %bar, i64 2
+ %tmp1 = load volatile i64* %add.ptr1, align 8
+ %add = add nsw i64 %tmp1, %tmp
+ ret i64 %add
+}
diff --git a/test/CodeGen/AArch64/arm64-vpopcnt.ll b/test/CodeGen/AArch64/arm64-vpopcnt.ll
new file mode 100644
index 0000000..25306eb
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vpopcnt.ll
@@ -0,0 +1,68 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone | FileCheck %s
+target triple = "arm64-apple-ios"
+
+; The non-byte ones used to fail with "Cannot select"
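+; All of them should now lower through the byte-wise cnt instruction, which is
+; all that is checked below.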
+
+; CHECK-LABEL: ctpopv8i8
+; CHECK: cnt.8b
+define <8 x i8> @ctpopv8i8(<8 x i8> %x) nounwind readnone {
+ %cnt = tail call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %x)
+ ret <8 x i8> %cnt
+}
+
+declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>) nounwind readnone
+
+; CHECK-LABEL: ctpopv4i16
+; CHECK: cnt.8b
+define <4 x i16> @ctpopv4i16(<4 x i16> %x) nounwind readnone {
+ %cnt = tail call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %x)
+ ret <4 x i16> %cnt
+}
+
+declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone
+
+; CHECK-LABEL: ctpopv2i32
+; CHECK: cnt.8b
+define <2 x i32> @ctpopv2i32(<2 x i32> %x) nounwind readnone {
+ %cnt = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %x)
+ ret <2 x i32> %cnt
+}
+
+declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone
+
+
+; CHECK-LABEL: ctpopv16i8
+; CHECK: cnt.16b
+define <16 x i8> @ctpopv16i8(<16 x i8> %x) nounwind readnone {
+ %cnt = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %x)
+ ret <16 x i8> %cnt
+}
+
+declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) nounwind readnone
+
+; CHECK-LABEL: ctpopv8i16
+; CHECK: cnt.8b
+define <8 x i16> @ctpopv8i16(<8 x i16> %x) nounwind readnone {
+ %cnt = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %x)
+ ret <8 x i16> %cnt
+}
+
+declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone
+
+; CHECK-LABEL: ctpopv4i32
+; CHECK: cnt.8b
+define <4 x i32> @ctpopv4i32(<4 x i32> %x) nounwind readnone {
+ %cnt = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %x)
+ ret <4 x i32> %cnt
+}
+
+declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
+
+; CHECK-LABEL: ctpopv2i64
+; CHECK: cnt.8b
+define <2 x i64> @ctpopv2i64(<2 x i64> %x) nounwind readnone {
+ %cnt = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %x)
+ ret <2 x i64> %cnt
+}
+
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-vqadd.ll b/test/CodeGen/AArch64/arm64-vqadd.ll
new file mode 100644
index 0000000..20f7e2c
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vqadd.ll
@@ -0,0 +1,332 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
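+; Saturating adds: signed (sqadd), unsigned (uqadd), and the mixed-signedness
+; accumulating suqadd/usqadd forms, across the 64-bit and 128-bit vector types
+; plus the scalar variants.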
+define <8 x i8> @sqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sqadd8b:
+;CHECK: sqadd.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqadd4h:
+;CHECK: sqadd.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqadd2s:
+;CHECK: sqadd.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @uqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uqadd8b:
+;CHECK: uqadd.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @uqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uqadd4h:
+;CHECK: uqadd.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uqadd2s:
+;CHECK: uqadd.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sqadd16b:
+;CHECK: sqadd.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @sqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqadd8h:
+;CHECK: sqadd.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqadd4s:
+;CHECK: sqadd.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: sqadd2d:
+;CHECK: sqadd.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @uqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uqadd16b:
+;CHECK: uqadd.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @uqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uqadd8h:
+;CHECK: uqadd.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @uqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uqadd4s:
+;CHECK: uqadd.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @uqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: uqadd2d:
+;CHECK: uqadd.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i8> @usqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: usqadd8b:
+;CHECK: usqadd.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @usqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: usqadd4h:
+;CHECK: usqadd.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @usqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: usqadd2s:
+;CHECK: usqadd.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.usqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @usqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: usqadd16b:
+;CHECK: usqadd.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.usqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @usqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: usqadd8h:
+;CHECK: usqadd.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.usqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @usqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: usqadd4s:
+;CHECK: usqadd.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.usqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @usqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: usqadd2d:
+;CHECK: usqadd.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.usqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define i64 @usqadd_d(i64 %l, i64 %r) nounwind {
+; CHECK-LABEL: usqadd_d:
+; CHECK: usqadd {{d[0-9]+}}, {{d[0-9]+}}
+ %sum = call i64 @llvm.aarch64.neon.usqadd.i64(i64 %l, i64 %r)
+ ret i64 %sum
+}
+
+define i32 @usqadd_s(i32 %l, i32 %r) nounwind {
+; CHECK-LABEL: usqadd_s:
+; CHECK: usqadd {{s[0-9]+}}, {{s[0-9]+}}
+ %sum = call i32 @llvm.aarch64.neon.usqadd.i32(i32 %l, i32 %r)
+ ret i32 %sum
+}
+
+declare <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.usqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare i64 @llvm.aarch64.neon.usqadd.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.usqadd.i32(i32, i32) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.usqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.usqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.usqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.usqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i8> @suqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: suqadd8b:
+;CHECK: suqadd.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @suqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: suqadd4h:
+;CHECK: suqadd.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @suqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: suqadd2s:
+;CHECK: suqadd.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @suqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: suqadd16b:
+;CHECK: suqadd.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @suqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: suqadd8h:
+;CHECK: suqadd.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @suqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: suqadd4s:
+;CHECK: suqadd.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @suqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: suqadd2d:
+;CHECK: suqadd.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <1 x i64> @suqadd_1d(<1 x i64> %l, <1 x i64> %r) nounwind {
+; CHECK-LABEL: suqadd_1d:
+; CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}}
+ %sum = call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> %l, <1 x i64> %r)
+ ret <1 x i64> %sum
+}
+
+define i64 @suqadd_d(i64 %l, i64 %r) nounwind {
+; CHECK-LABEL: suqadd_d:
+; CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}}
+ %sum = call i64 @llvm.aarch64.neon.suqadd.i64(i64 %l, i64 %r)
+ ret i64 %sum
+}
+
+define i32 @suqadd_s(i32 %l, i32 %r) nounwind {
+; CHECK-LABEL: suqadd_s:
+; CHECK: suqadd {{s[0-9]+}}, {{s[0-9]+}}
+ %sum = call i32 @llvm.aarch64.neon.suqadd.i32(i32 %l, i32 %r)
+ ret i32 %sum
+}
+
+declare <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare i64 @llvm.aarch64.neon.suqadd.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.suqadd.i32(i32, i32) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-vqsub.ll b/test/CodeGen/AArch64/arm64-vqsub.ll
new file mode 100644
index 0000000..dde3ac3
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vqsub.ll
@@ -0,0 +1,147 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @sqsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sqsub8b:
+;CHECK: sqsub.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqsub4h:
+;CHECK: sqsub.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqsub2s:
+;CHECK: sqsub.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @uqsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uqsub8b:
+;CHECK: uqsub.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @uqsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uqsub4h:
+;CHECK: uqsub.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uqsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uqsub2s:
+;CHECK: uqsub.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sqsub16b:
+;CHECK: sqsub.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @sqsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqsub8h:
+;CHECK: sqsub.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sqsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqsub4s:
+;CHECK: sqsub.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sqsub2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: sqsub2d:
+;CHECK: sqsub.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @uqsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uqsub16b:
+;CHECK: uqsub.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @uqsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uqsub8h:
+;CHECK: uqsub.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @uqsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uqsub4s:
+;CHECK: uqsub.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @uqsub2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: uqsub2d:
+;CHECK: uqsub.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.sqsub.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.uqsub.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-vselect.ll b/test/CodeGen/AArch64/arm64-vselect.ll
new file mode 100644
index 0000000..9988512
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vselect.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+;CHECK: @func63
+;CHECK: cmeq.4h v0, v0, v1
+
+;FIXME: currently this generates 3 instructions:
+; ushll.4s v0, v0, #0
+; shl.4s v0, v0, #31
+; sshr.4s v0, v0, #31
+;but these instructions can be optimized into 1 instruction:
+; sshll.4s v0, v0, #0
+
+;CHECK: bsl.16b v0, v2, v3
+;CHECK: str q0, [x0]
+;CHECK: ret
+
+%T0_63 = type <4 x i16>
+%T1_63 = type <4 x i32>
+%T2_63 = type <4 x i1>
+define void @func63(%T1_63* %out, %T0_63 %v0, %T0_63 %v1, %T1_63 %v2, %T1_63 %v3) {
+ %cond = icmp eq %T0_63 %v0, %v1
+ %r = select %T2_63 %cond, %T1_63 %v2, %T1_63 %v3
+ store %T1_63 %r, %T1_63* %out
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-vsetcc_fp.ll b/test/CodeGen/AArch64/arm64-vsetcc_fp.ll
new file mode 100644
index 0000000..f4f4714
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vsetcc_fp.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+define <2 x i32> @fcmp_one(<2 x float> %x, <2 x float> %y) nounwind optsize readnone {
+; CHECK-LABEL: fcmp_one:
+; CHECK-NEXT: fcmgt.2s [[REG:v[0-9]+]], v0, v1
+; CHECK-NEXT: fcmgt.2s [[REG2:v[0-9]+]], v1, v0
+; CHECK-NEXT: orr.8b v0, [[REG2]], [[REG]]
+; CHECK-NEXT: ret
+ %tmp = fcmp one <2 x float> %x, %y
+ %or = sext <2 x i1> %tmp to <2 x i32>
+ ret <2 x i32> %or
+}
diff --git a/test/CodeGen/AArch64/arm64-vshift.ll b/test/CodeGen/AArch64/arm64-vshift.ll
new file mode 100644
index 0000000..82ae486
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vshift.ll
@@ -0,0 +1,1917 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -enable-misched=false | FileCheck %s
+
+define <8 x i8> @sqshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sqshl8b:
+;CHECK: sqshl.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqshl4h:
+;CHECK: sqshl.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqshl2s:
+;CHECK: sqshl.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @uqshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uqshl8b:
+;CHECK: uqshl.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @uqshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uqshl4h:
+;CHECK: uqshl.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uqshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uqshl2s:
+;CHECK: uqshl.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sqshl16b:
+;CHECK: sqshl.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @sqshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqshl8h:
+;CHECK: sqshl.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sqshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqshl4s:
+;CHECK: sqshl.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sqshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: sqshl2d:
+;CHECK: sqshl.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @uqshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uqshl16b:
+;CHECK: uqshl.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @uqshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uqshl8h:
+;CHECK: uqshl.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @uqshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uqshl4s:
+;CHECK: uqshl.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @uqshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: uqshl2d:
+;CHECK: uqshl.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i8> @srshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: srshl8b:
+;CHECK: srshl.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @srshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: srshl4h:
+;CHECK: srshl.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @srshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: srshl2s:
+;CHECK: srshl.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @urshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: urshl8b:
+;CHECK: urshl.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @urshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: urshl4h:
+;CHECK: urshl.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @urshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: urshl2s:
+;CHECK: urshl.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @srshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: srshl16b:
+;CHECK: srshl.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @srshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: srshl8h:
+;CHECK: srshl.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @srshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: srshl4s:
+;CHECK: srshl.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @srshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: srshl2d:
+;CHECK: srshl.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @urshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: urshl16b:
+;CHECK: urshl.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @urshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: urshl8h:
+;CHECK: urshl.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @urshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: urshl4s:
+;CHECK: urshl.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @urshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: urshl2d:
+;CHECK: urshl.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i8> @sqrshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sqrshl8b:
+;CHECK: sqrshl.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqrshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqrshl4h:
+;CHECK: sqrshl.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqrshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqrshl2s:
+;CHECK: sqrshl.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @uqrshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uqrshl8b:
+;CHECK: uqrshl.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @uqrshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uqrshl4h:
+;CHECK: uqrshl.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uqrshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uqrshl2s:
+;CHECK: uqrshl.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqrshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sqrshl16b:
+;CHECK: sqrshl.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @sqrshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqrshl8h:
+;CHECK: sqrshl.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sqrshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqrshl4s:
+;CHECK: sqrshl.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sqrshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: sqrshl2d:
+;CHECK: sqrshl.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @uqrshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uqrshl16b:
+;CHECK: uqrshl.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @uqrshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uqrshl8h:
+;CHECK: uqrshl.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @uqrshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uqrshl4s:
+;CHECK: uqrshl.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @uqrshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: uqrshl2d:
+;CHECK: uqrshl.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i8> @urshr8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: urshr8b:
+;CHECK: urshr.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @urshr4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: urshr4h:
+;CHECK: urshr.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @urshr2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: urshr2s:
+;CHECK: urshr.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @urshr16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: urshr16b:
+;CHECK: urshr.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @urshr8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: urshr8h:
+;CHECK: urshr.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @urshr4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: urshr4s:
+;CHECK: urshr.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @urshr2d(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: urshr2d:
+;CHECK: urshr.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i8> @srshr8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: srshr8b:
+;CHECK: srshr.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @srshr4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: srshr4h:
+;CHECK: srshr.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @srshr2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: srshr2s:
+;CHECK: srshr.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @srshr16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: srshr16b:
+;CHECK: srshr.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @srshr8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: srshr8h:
+;CHECK: srshr.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @srshr4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: srshr4s:
+;CHECK: srshr.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @srshr2d(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: srshr2d:
+;CHECK: srshr.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i8> @sqshlu8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: sqshlu8b:
+;CHECK: sqshlu.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqshlu4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: sqshlu4h:
+;CHECK: sqshlu.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqshlu2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: sqshlu2s:
+;CHECK: sqshlu.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqshlu16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: sqshlu16b:
+;CHECK: sqshlu.16b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @sqshlu8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqshlu8h:
+;CHECK: sqshlu.8h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sqshlu4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqshlu4s:
+;CHECK: sqshlu.4s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sqshlu2d(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: sqshlu2d:
+;CHECK: sqshlu.2d v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i8> @rshrn8b(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: rshrn8b:
+;CHECK: rshrn.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @rshrn4h(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: rshrn4h:
+;CHECK: rshrn.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @rshrn2s(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: rshrn2s:
+;CHECK: rshrn.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @rshrn16b(<8 x i8> *%ret, <8 x i16>* %A) nounwind {
+;CHECK-LABEL: rshrn16b:
+;CHECK: rshrn2.16b v0, {{v[0-9]+}}, #1
+ %out = load <8 x i8>* %ret
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @rshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
+;CHECK-LABEL: rshrn8h:
+;CHECK: rshrn2.8h v0, {{v[0-9]+}}, #1
+ %out = load <4 x i16>* %ret
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @rshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
+;CHECK-LABEL: rshrn4s:
+;CHECK: rshrn2.4s v0, {{v[0-9]+}}, #1
+ %out = load <2 x i32>* %ret
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %tmp4
+}
+
+declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64>, i32) nounwind readnone
+
+define <8 x i8> @shrn8b(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: shrn8b:
+;CHECK: shrn.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = lshr <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %tmp3 = trunc <8 x i16> %tmp2 to <8 x i8>
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @shrn4h(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: shrn4h:
+;CHECK: shrn.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = lshr <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
+ %tmp3 = trunc <4 x i32> %tmp2 to <4 x i16>
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @shrn2s(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: shrn2s:
+;CHECK: shrn.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = lshr <2 x i64> %tmp1, <i64 1, i64 1>
+ %tmp3 = trunc <2 x i64> %tmp2 to <2 x i32>
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @shrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
+;CHECK-LABEL: shrn16b:
+;CHECK: shrn2.16b v0, {{v[0-9]+}}, #1
+ %out = load <8 x i8>* %ret
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = lshr <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %tmp3 = trunc <8 x i16> %tmp2 to <8 x i8>
+ %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @shrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
+;CHECK-LABEL: shrn8h:
+;CHECK: shrn2.8h v0, {{v[0-9]+}}, #1
+ %out = load <4 x i16>* %ret
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = lshr <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
+ %tmp3 = trunc <4 x i32> %tmp2 to <4 x i16>
+ %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @shrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
+;CHECK-LABEL: shrn4s:
+;CHECK: shrn2.4s v0, {{v[0-9]+}}, #1
+ %out = load <2 x i32>* %ret
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = lshr <2 x i64> %tmp1, <i64 1, i64 1>
+ %tmp3 = trunc <2 x i64> %tmp2 to <2 x i32>
+ %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %tmp4
+}
+
+declare <8 x i8> @llvm.aarch64.neon.shrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.shrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.shrn.v2i32(<2 x i64>, i32) nounwind readnone
+
+define i32 @sqshrn1s(i64 %A) nounwind {
+; CHECK-LABEL: sqshrn1s:
+; CHECK: sqshrn {{s[0-9]+}}, d0, #1
+ %tmp = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 %A, i32 1)
+ ret i32 %tmp
+}
+
+define <8 x i8> @sqshrn8b(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqshrn8b:
+;CHECK: sqshrn.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqshrn4h(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqshrn4h:
+;CHECK: sqshrn.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqshrn2s(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: sqshrn2s:
+;CHECK: sqshrn.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ ret <2 x i32> %tmp3
+}
+
+
+define <16 x i8> @sqshrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqshrn16b:
+;CHECK: sqshrn2.16b v0, {{v[0-9]+}}, #1
+ %out = load <8 x i8>* %ret
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @sqshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqshrn8h:
+;CHECK: sqshrn2.8h v0, {{v[0-9]+}}, #1
+ %out = load <4 x i16>* %ret
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @sqshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
+;CHECK-LABEL: sqshrn4s:
+;CHECK: sqshrn2.4s v0, {{v[0-9]+}}, #1
+ %out = load <2 x i32>* %ret
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %tmp4
+}
+
+declare i32 @llvm.aarch64.neon.sqshrn.i32(i64, i32) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64>, i32) nounwind readnone
+
+define i32 @sqshrun1s(i64 %A) nounwind {
+; CHECK-LABEL: sqshrun1s:
+; CHECK: sqshrun {{s[0-9]+}}, d0, #1
+ %tmp = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 %A, i32 1)
+ ret i32 %tmp
+}
+
+define <8 x i8> @sqshrun8b(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqshrun8b:
+;CHECK: sqshrun.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> %tmp1, i32 1)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqshrun4h(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqshrun4h:
+;CHECK: sqshrun.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> %tmp1, i32 1)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqshrun2s(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: sqshrun2s:
+;CHECK: sqshrun.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> %tmp1, i32 1)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqshrun16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqshrun16b:
+;CHECK: sqshrun2.16b v0, {{v[0-9]+}}, #1
+ %out = load <8 x i8>* %ret
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @sqshrun8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqshrun8h:
+;CHECK: sqshrun2.8h v0, {{v[0-9]+}}, #1
+ %out = load <4 x i16>* %ret
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @sqshrun4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
+;CHECK-LABEL: sqshrun4s:
+;CHECK: sqshrun2.4s v0, {{v[0-9]+}}, #1
+ %out = load <2 x i32>* %ret
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %tmp4
+}
+
+declare i32 @llvm.aarch64.neon.sqshrun.i32(i64, i32) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64>, i32) nounwind readnone
+
+define i32 @sqrshrn1s(i64 %A) nounwind {
+; CHECK-LABEL: sqrshrn1s:
+; CHECK: sqrshrn {{s[0-9]+}}, d0, #1
+ %tmp = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 %A, i32 1)
+ ret i32 %tmp
+}
+
+define <8 x i8> @sqrshrn8b(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqrshrn8b:
+;CHECK: sqrshrn.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqrshrn4h(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqrshrn4h:
+;CHECK: sqrshrn.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqrshrn2s(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: sqrshrn2s:
+;CHECK: sqrshrn.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqrshrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqrshrn16b:
+;CHECK: sqrshrn2.16b v0, {{v[0-9]+}}, #1
+ %out = load <8 x i8>* %ret
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @sqrshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqrshrn8h:
+;CHECK: sqrshrn2.8h v0, {{v[0-9]+}}, #1
+ %out = load <4 x i16>* %ret
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @sqrshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
+;CHECK-LABEL: sqrshrn4s:
+;CHECK: sqrshrn2.4s v0, {{v[0-9]+}}, #1
+ %out = load <2 x i32>* %ret
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %tmp4
+}
+
+declare i32 @llvm.aarch64.neon.sqrshrn.i32(i64, i32) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64>, i32) nounwind readnone
+
+define i32 @sqrshrun1s(i64 %A) nounwind {
+; CHECK-LABEL: sqrshrun1s:
+; CHECK: sqrshrun {{s[0-9]+}}, d0, #1
+ %tmp = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 %A, i32 1)
+ ret i32 %tmp
+}
+
+define <8 x i8> @sqrshrun8b(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqrshrun8b:
+;CHECK: sqrshrun.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> %tmp1, i32 1)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqrshrun4h(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqrshrun4h:
+;CHECK: sqrshrun.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> %tmp1, i32 1)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqrshrun2s(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: sqrshrun2s:
+;CHECK: sqrshrun.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> %tmp1, i32 1)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqrshrun16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqrshrun16b:
+;CHECK: sqrshrun2.16b v0, {{v[0-9]+}}, #1
+ %out = load <8 x i8>* %ret
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @sqrshrun8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqrshrun8h:
+;CHECK: sqrshrun2.8h v0, {{v[0-9]+}}, #1
+ %out = load <4 x i16>* %ret
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @sqrshrun4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
+;CHECK-LABEL: sqrshrun4s:
+;CHECK: sqrshrun2.4s v0, {{v[0-9]+}}, #1
+ %out = load <2 x i32>* %ret
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %tmp4
+}
+
+declare i32 @llvm.aarch64.neon.sqrshrun.i32(i64, i32) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64>, i32) nounwind readnone
+
+define i32 @uqrshrn1s(i64 %A) nounwind {
+; CHECK-LABEL: uqrshrn1s:
+; CHECK: uqrshrn {{s[0-9]+}}, d0, #1
+ %tmp = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 %A, i32 1)
+ ret i32 %tmp
+}
+
+define <8 x i8> @uqrshrn8b(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: uqrshrn8b:
+;CHECK: uqrshrn.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @uqrshrn4h(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: uqrshrn4h:
+;CHECK: uqrshrn.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uqrshrn2s(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: uqrshrn2s:
+;CHECK: uqrshrn.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @uqrshrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
+;CHECK-LABEL: uqrshrn16b:
+;CHECK: uqrshrn2.16b v0, {{v[0-9]+}}, #1
+ %out = load <8 x i8>* %ret
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @uqrshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
+;CHECK-LABEL: uqrshrn8h:
+;CHECK: uqrshrn2.8h v0, {{v[0-9]+}}, #1
+ %out = load <4 x i16>* %ret
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @uqrshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
+;CHECK-LABEL: uqrshrn4s:
+;CHECK: uqrshrn2.4s v0, {{v[0-9]+}}, #1
+ %out = load <2 x i32>* %ret
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %tmp4
+}
+
+declare i32 @llvm.aarch64.neon.uqrshrn.i32(i64, i32) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64>, i32) nounwind readnone
+
+define i32 @uqshrn1s(i64 %A) nounwind {
+; CHECK-LABEL: uqshrn1s:
+; CHECK: uqshrn {{s[0-9]+}}, d0, #1
+ %tmp = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 %A, i32 1)
+ ret i32 %tmp
+}
+
+define <8 x i8> @uqshrn8b(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: uqshrn8b:
+;CHECK: uqshrn.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @uqshrn4h(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: uqshrn4h:
+;CHECK: uqshrn.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uqshrn2s(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: uqshrn2s:
+;CHECK: uqshrn.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @uqshrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
+;CHECK-LABEL: uqshrn16b:
+;CHECK: uqshrn2.16b v0, {{v[0-9]+}}, #1
+ %out = load <8 x i8>* %ret
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> %tmp1, i32 1)
+ %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @uqshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
+;CHECK-LABEL: uqshrn8h:
+;CHECK: uqshrn2.8h v0, {{v[0-9]+}}, #1
+ %out = load <4 x i16>* %ret
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> %tmp1, i32 1)
+ %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @uqshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
+;CHECK-LABEL: uqshrn4s:
+;CHECK: uqshrn2.4s v0, {{v[0-9]+}}, #1
+ %out = load <2 x i32>* %ret
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> %tmp1, i32 1)
+ %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %tmp4
+}
+
+declare i32 @llvm.aarch64.neon.uqshrn.i32(i64, i32) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64>, i32) nounwind readnone
+
+define <8 x i16> @ushll8h(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: ushll8h:
+;CHECK: ushll.8h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
+ %tmp3 = shl <8 x i16> %tmp2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @ushll4s(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: ushll4s:
+;CHECK: ushll.4s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
+ %tmp3 = shl <4 x i32> %tmp2, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @ushll2d(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: ushll2d:
+;CHECK: ushll.2d v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
+ %tmp3 = shl <2 x i64> %tmp2, <i64 1, i64 1>
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @ushll2_8h(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: ushll2_8h:
+;CHECK: ushll2.8h v0, {{v[0-9]+}}, #1
+ %load1 = load <16 x i8>* %A
+ %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
+ %tmp3 = shl <8 x i16> %tmp2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @ushll2_4s(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: ushll2_4s:
+;CHECK: ushll2.4s v0, {{v[0-9]+}}, #1
+ %load1 = load <8 x i16>* %A
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
+ %tmp3 = shl <4 x i32> %tmp2, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @ushll2_2d(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: ushll2_2d:
+;CHECK: ushll2.2d v0, {{v[0-9]+}}, #1
+ %load1 = load <4 x i32>* %A
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
+ %tmp3 = shl <2 x i64> %tmp2, <i64 1, i64 1>
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @sshll8h(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: sshll8h:
+;CHECK: sshll.8h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
+ %tmp3 = shl <8 x i16> %tmp2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sshll4s(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: sshll4s:
+;CHECK: sshll.4s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
+ %tmp3 = shl <4 x i32> %tmp2, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sshll2d(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: sshll2d:
+;CHECK: sshll.2d v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
+ %tmp3 = shl <2 x i64> %tmp2, <i64 1, i64 1>
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i16> @sshll2_8h(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: sshll2_8h:
+;CHECK: sshll2.8h v0, {{v[0-9]+}}, #1
+ %load1 = load <16 x i8>* %A
+ %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
+ %tmp3 = shl <8 x i16> %tmp2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sshll2_4s(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: sshll2_4s:
+;CHECK: sshll2.4s v0, {{v[0-9]+}}, #1
+ %load1 = load <8 x i16>* %A
+ %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
+ %tmp3 = shl <4 x i32> %tmp2, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sshll2_2d(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: sshll2_2d:
+;CHECK: sshll2.2d v0, {{v[0-9]+}}, #1
+ %load1 = load <4 x i32>* %A
+ %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
+ %tmp3 = shl <2 x i64> %tmp2, <i64 1, i64 1>
+ ret <2 x i64> %tmp3
+}
+
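+; Saturating shifts by a splatted constant should use the immediate form of
+; sqshl/uqshl rather than the register form.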
+define <8 x i8> @sqshli8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: sqshli8b:
+;CHECK: sqshl.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqshli4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: sqshli4h:
+;CHECK: sqshl.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqshli2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: sqshli2s:
+;CHECK: sqshl.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqshli16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: sqshli16b:
+;CHECK: sqshl.16b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @sqshli8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: sqshli8h:
+;CHECK: sqshl.8h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sqshli4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: sqshli4s:
+;CHECK: sqshl.4s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sqshli2d(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: sqshli2d:
+;CHECK: sqshl.2d v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i8> @uqshli8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: uqshli8b:
+;CHECK: uqshl.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @uqshli4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: uqshli4h:
+;CHECK: uqshl.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uqshli2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: uqshli2s:
+;CHECK: uqshl.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @uqshli16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: uqshli16b:
+;CHECK: uqshl.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @uqshli8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: uqshli8h:
+;CHECK: uqshl.8h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @uqshli4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: uqshli4s:
+;CHECK: uqshl.4s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @uqshli2d(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: uqshli2d:
+;CHECK: uqshl.2d v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
+ ret <2 x i64> %tmp3
+}
+
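+; A rounding shift right (urshl/srshl by a negative splat) followed by an add
+; should be combined into the accumulating ursra/srsra instructions.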
+define <8 x i8> @ursra8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: ursra8b:
+;CHECK: ursra.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ %tmp4 = load <8 x i8>* %B
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @ursra4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: ursra4h:
+;CHECK: ursra.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+ %tmp4 = load <4 x i16>* %B
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @ursra2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: ursra2s:
+;CHECK: ursra.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
+ %tmp4 = load <2 x i32>* %B
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <16 x i8> @ursra16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: ursra16b:
+;CHECK: ursra.16b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ %tmp4 = load <16 x i8>* %B
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @ursra8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: ursra8h:
+;CHECK: ursra.8h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+ %tmp4 = load <8 x i16>* %B
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @ursra4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: ursra4s:
+;CHECK: ursra.4s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+ %tmp4 = load <4 x i32>* %B
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @ursra2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: ursra2d:
+;CHECK: ursra.2d v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
+ %tmp4 = load <2 x i64>* %B
+ %tmp5 = add <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i8> @srsra8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: srsra8b:
+;CHECK: srsra.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ %tmp4 = load <8 x i8>* %B
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @srsra4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: srsra4h:
+;CHECK: srsra.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+ %tmp4 = load <4 x i16>* %B
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @srsra2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: srsra2s:
+;CHECK: srsra.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
+ %tmp4 = load <2 x i32>* %B
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <16 x i8> @srsra16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: srsra16b:
+;CHECK: srsra.16b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+ %tmp4 = load <16 x i8>* %B
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @srsra8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: srsra8h:
+;CHECK: srsra.8h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+ %tmp4 = load <8 x i16>* %B
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @srsra4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: srsra4s:
+;CHECK: srsra.4s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+ %tmp4 = load <4 x i32>* %B
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @srsra2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: srsra2d:
+;CHECK: srsra.2d v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
+ %tmp4 = load <2 x i64>* %B
+ %tmp5 = add <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
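+; A plain lshr/ashr by a constant followed by an add should be combined into
+; the accumulating usra/ssra instructions.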
+define <8 x i8> @usra8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: usra8b:
+;CHECK: usra.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = lshr <8 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %tmp4 = load <8 x i8>* %B
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @usra4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: usra4h:
+;CHECK: usra.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = lshr <4 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1>
+ %tmp4 = load <4 x i16>* %B
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @usra2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: usra2s:
+;CHECK: usra.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = lshr <2 x i32> %tmp1, <i32 1, i32 1>
+ %tmp4 = load <2 x i32>* %B
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <16 x i8> @usra16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: usra16b:
+;CHECK: usra.16b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = lshr <16 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %tmp4 = load <16 x i8>* %B
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @usra8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: usra8h:
+;CHECK: usra.8h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = lshr <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %tmp4 = load <8 x i16>* %B
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @usra4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: usra4s:
+;CHECK: usra.4s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = lshr <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = load <4 x i32>* %B
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @usra2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: usra2d:
+;CHECK: usra.2d v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = lshr <2 x i64> %tmp1, <i64 1, i64 1>
+ %tmp4 = load <2 x i64>* %B
+ %tmp5 = add <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i8> @ssra8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: ssra8b:
+;CHECK: ssra.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i8>* %A
+ %tmp3 = ashr <8 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %tmp4 = load <8 x i8>* %B
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @ssra4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: ssra4h:
+;CHECK: ssra.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i16>* %A
+ %tmp3 = ashr <4 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1>
+ %tmp4 = load <4 x i16>* %B
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @ssra2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: ssra2s:
+;CHECK: ssra.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = ashr <2 x i32> %tmp1, <i32 1, i32 1>
+ %tmp4 = load <2 x i32>* %B
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <16 x i8> @ssra16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: ssra16b:
+;CHECK: ssra.16b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <16 x i8>* %A
+ %tmp3 = ashr <16 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %tmp4 = load <16 x i8>* %B
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @ssra8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: ssra8h:
+;CHECK: ssra.8h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp3 = ashr <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %tmp4 = load <8 x i16>* %B
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @ssra4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: ssra4s:
+;CHECK: ssra.4s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = ashr <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
+ %tmp4 = load <4 x i32>* %B
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @ssra2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: ssra2d:
+;CHECK: ssra.2d v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp3 = ashr <2 x i64> %tmp1, <i64 1, i64 1>
+ %tmp4 = load <2 x i64>* %B
+ %tmp5 = add <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
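+; There is no accumulating form for or, so a constant shift feeding an or must
+; stay as a separate shr/shl followed by orr; the CHECK-NEXT lines make sure
+; no bogus fold is applied.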
+define <8 x i8> @shr_orr8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: shr_orr8b:
+;CHECK: shr.8b v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.8b
+;CHECK-NEXT: ret
+ %tmp1 = load <8 x i8>* %A
+ %tmp4 = load <8 x i8>* %B
+ %tmp3 = lshr <8 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %tmp5 = or <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @shr_orr4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: shr_orr4h:
+;CHECK: shr.4h v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.8b
+;CHECK-NEXT: ret
+ %tmp1 = load <4 x i16>* %A
+ %tmp4 = load <4 x i16>* %B
+ %tmp3 = lshr <4 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1>
+ %tmp5 = or <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @shr_orr2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: shr_orr2s:
+;CHECK: shr.2s v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.8b
+;CHECK-NEXT: ret
+ %tmp1 = load <2 x i32>* %A
+ %tmp4 = load <2 x i32>* %B
+ %tmp3 = lshr <2 x i32> %tmp1, <i32 1, i32 1>
+ %tmp5 = or <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <16 x i8> @shr_orr16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: shr_orr16b:
+;CHECK: shr.16b v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.16b
+;CHECK-NEXT: ret
+ %tmp1 = load <16 x i8>* %A
+ %tmp4 = load <16 x i8>* %B
+ %tmp3 = lshr <16 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %tmp5 = or <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @shr_orr8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: shr_orr8h:
+;CHECK: shr.8h v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.16b
+;CHECK-NEXT: ret
+ %tmp1 = load <8 x i16>* %A
+ %tmp4 = load <8 x i16>* %B
+ %tmp3 = lshr <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %tmp5 = or <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @shr_orr4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: shr_orr4s:
+;CHECK: shr.4s v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.16b
+;CHECK-NEXT: ret
+ %tmp1 = load <4 x i32>* %A
+ %tmp4 = load <4 x i32>* %B
+ %tmp3 = lshr <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
+ %tmp5 = or <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @shr_orr2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: shr_orr2d:
+;CHECK: shr.2d v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.16b
+;CHECK-NEXT: ret
+ %tmp1 = load <2 x i64>* %A
+ %tmp4 = load <2 x i64>* %B
+ %tmp3 = lshr <2 x i64> %tmp1, <i64 1, i64 1>
+ %tmp5 = or <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i8> @shl_orr8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: shl_orr8b:
+;CHECK: shl.8b v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.8b
+;CHECK-NEXT: ret
+ %tmp1 = load <8 x i8>* %A
+ %tmp4 = load <8 x i8>* %B
+ %tmp3 = shl <8 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %tmp5 = or <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @shl_orr4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: shl_orr4h:
+;CHECK: shl.4h v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.8b
+;CHECK-NEXT: ret
+ %tmp1 = load <4 x i16>* %A
+ %tmp4 = load <4 x i16>* %B
+ %tmp3 = shl <4 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1>
+ %tmp5 = or <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <2 x i32> @shl_orr2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: shl_orr2s:
+;CHECK: shl.2s v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.8b
+;CHECK-NEXT: ret
+ %tmp1 = load <2 x i32>* %A
+ %tmp4 = load <2 x i32>* %B
+ %tmp3 = shl <2 x i32> %tmp1, <i32 1, i32 1>
+ %tmp5 = or <2 x i32> %tmp3, %tmp4
+ ret <2 x i32> %tmp5
+}
+
+define <16 x i8> @shl_orr16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: shl_orr16b:
+;CHECK: shl.16b v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.16b
+;CHECK-NEXT: ret
+ %tmp1 = load <16 x i8>* %A
+ %tmp4 = load <16 x i8>* %B
+ %tmp3 = shl <16 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %tmp5 = or <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @shl_orr8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: shl_orr8h:
+;CHECK: shl.8h v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.16b
+;CHECK-NEXT: ret
+ %tmp1 = load <8 x i16>* %A
+ %tmp4 = load <8 x i16>* %B
+ %tmp3 = shl <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %tmp5 = or <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @shl_orr4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: shl_orr4s:
+;CHECK: shl.4s v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.16b
+;CHECK-NEXT: ret
+ %tmp1 = load <4 x i32>* %A
+ %tmp4 = load <4 x i32>* %B
+ %tmp3 = shl <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
+ %tmp5 = or <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @shl_orr2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: shl_orr2d:
+;CHECK: shl.2d v0, {{v[0-9]+}}, #1
+;CHECK-NEXT: orr.16b
+;CHECK-NEXT: ret
+ %tmp1 = load <2 x i64>* %A
+ %tmp4 = load <2 x i64>* %B
+ %tmp3 = shl <2 x i64> %tmp1, <i64 1, i64 1>
+ %tmp5 = or <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
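+; A zero-extend shifted left by exactly the source element width should be
+; selected as shll/shll2.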
+define <8 x i16> @shll(<8 x i8> %in) {
+; CHECK-LABEL: shll:
+; CHECK: shll.8h v0, {{v[0-9]+}}, #8
+ %ext = zext <8 x i8> %in to <8 x i16>
+ %res = shl <8 x i16> %ext, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @shll_high(<8 x i16> %in) {
+; CHECK-LABEL: shll_high:
+; CHECK: shll2.4s v0, {{v[0-9]+}}, #16
+ %extract = shufflevector <8 x i16> %in, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %ext = zext <4 x i16> %extract to <4 x i32>
+ %res = shl <4 x i32> %ext, <i32 16, i32 16, i32 16, i32 16>
+ ret <4 x i32> %res
+}
+
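+; The vsli intrinsic should map directly to the sli (shift left and insert)
+; instruction, including the scalar d-register form.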
+define <8 x i8> @sli8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sli8b:
+;CHECK: sli.8b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, i32 1)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sli4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sli4h:
+;CHECK: sli.4h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, i32 1)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sli2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sli2s:
+;CHECK: sli.2s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, i32 1)
+ ret <2 x i32> %tmp3
+}
+
+define <1 x i64> @sli1d(<1 x i64>* %A, <1 x i64>* %B) nounwind {
+;CHECK-LABEL: sli1d:
+;CHECK: sli d0, {{d[0-9]+}}, #1
+ %tmp1 = load <1 x i64>* %A
+ %tmp2 = load <1 x i64>* %B
+ %tmp3 = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2, i32 1)
+ ret <1 x i64> %tmp3
+}
+
+define <16 x i8> @sli16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sli16b:
+;CHECK: sli.16b v0, {{v[0-9]+}}, #1
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, i32 1)
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @sli8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sli8h:
+;CHECK: sli.8h v0, {{v[0-9]+}}, #1
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, i32 1)
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sli4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sli4s:
+;CHECK: sli.4s v0, {{v[0-9]+}}, #1
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, i32 1)
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sli2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: sli2d:
+;CHECK: sli.2d v0, {{v[0-9]+}}, #1
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2, i32 1)
+ ret <2 x i64> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8>, <8 x i8>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16>, <4 x i16>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32>, <2 x i32>, i32) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64>, <1 x i64>, i32) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8>, <16 x i8>, i32) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16>, <8 x i16>, i32) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32>, <4 x i32>, i32) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32) nounwind readnone
+
+define <1 x i64> @ashr_v1i64(<1 x i64> %a, <1 x i64> %b) {
+; CHECK-LABEL: ashr_v1i64:
+; CHECK: neg d{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: sshl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %c = ashr <1 x i64> %a, %b
+ ret <1 x i64> %c
+}
diff --git a/test/CodeGen/AArch64/arm64-vshr.ll b/test/CodeGen/AArch64/arm64-vshr.ll
new file mode 100644
index 0000000..21eb579
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vshr.ll
@@ -0,0 +1,63 @@
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s -mcpu=cyclone | FileCheck %s
+
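+; AArch64 only has a left-shift-by-register instruction, so a variable right
+; shift is lowered to a negate of the shift amount followed by sshl/ushl.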
+define <8 x i16> @testShiftRightArith_v8i16(<8 x i16> %a, <8 x i16> %b) #0 {
+; CHECK-LABEL: testShiftRightArith_v8i16:
+; CHECK: neg.8h [[REG1:v[0-9]+]], [[REG1]]
+; CHECK-NEXT: sshl.8h [[REG2:v[0-9]+]], [[REG2]], [[REG1]]
+
+entry:
+ %a.addr = alloca <8 x i16>, align 16
+ %b.addr = alloca <8 x i16>, align 16
+ store <8 x i16> %a, <8 x i16>* %a.addr, align 16
+ store <8 x i16> %b, <8 x i16>* %b.addr, align 16
+ %0 = load <8 x i16>* %a.addr, align 16
+ %1 = load <8 x i16>* %b.addr, align 16
+ %shr = ashr <8 x i16> %0, %1
+ ret <8 x i16> %shr
+}
+
+define <4 x i32> @testShiftRightArith_v4i32(<4 x i32> %a, <4 x i32> %b) #0 {
+; CHECK-LABEL: testShiftRightArith_v4i32:
+; CHECK: neg.4s [[REG3:v[0-9]+]], [[REG3]]
+; CHECK-NEXT: sshl.4s [[REG4:v[0-9]+]], [[REG4]], [[REG3]]
+entry:
+ %a.addr = alloca <4 x i32>, align 32
+ %b.addr = alloca <4 x i32>, align 32
+ store <4 x i32> %a, <4 x i32>* %a.addr, align 32
+ store <4 x i32> %b, <4 x i32>* %b.addr, align 32
+ %0 = load <4 x i32>* %a.addr, align 32
+ %1 = load <4 x i32>* %b.addr, align 32
+ %shr = ashr <4 x i32> %0, %1
+ ret <4 x i32> %shr
+}
+
+define <8 x i16> @testShiftRightLogical(<8 x i16> %a, <8 x i16> %b) #0 {
+; CHECK-LABEL: testShiftRightLogical:
+; CHECK: neg.8h [[REG5:v[0-9]+]], [[REG5]]
+; CHECK-NEXT: ushl.8h [[REG6:v[0-9]+]], [[REG6]], [[REG5]]
+entry:
+ %a.addr = alloca <8 x i16>, align 16
+ %b.addr = alloca <8 x i16>, align 16
+ store <8 x i16> %a, <8 x i16>* %a.addr, align 16
+ store <8 x i16> %b, <8 x i16>* %b.addr, align 16
+ %0 = load <8 x i16>* %a.addr, align 16
+ %1 = load <8 x i16>* %b.addr, align 16
+ %shr = lshr <8 x i16> %0, %1
+ ret <8 x i16> %shr
+}
+
+define <1 x i64> @sshr_v1i64(<1 x i64> %A) nounwind {
+; CHECK-LABEL: sshr_v1i64:
+; CHECK: sshr d0, d0, #63
+ %tmp3 = ashr <1 x i64> %A, < i64 63 >
+ ret <1 x i64> %tmp3
+}
+
+define <1 x i64> @ushr_v1i64(<1 x i64> %A) nounwind {
+; CHECK-LABEL: ushr_v1i64:
+; CHECK: ushr d0, d0, #63
+ %tmp3 = lshr <1 x i64> %A, < i64 63 >
+ ret <1 x i64> %tmp3
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/arm64-vshuffle.ll b/test/CodeGen/AArch64/arm64-vshuffle.ll
new file mode 100644
index 0000000..62fd961
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vshuffle.ll
@@ -0,0 +1,115 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -mcpu=cyclone | FileCheck %s
+
+
+; The mask:
+; CHECK: lCPI0_0:
+; CHECK: .byte 2 ; 0x2
+; CHECK: .byte 255 ; 0xff
+; CHECK: .byte 6 ; 0x6
+; CHECK: .byte 255 ; 0xff
+; The second vector is legalized to undef and the elements of the first vector
+; are used instead.
+; CHECK: .byte 2 ; 0x2
+; CHECK: .byte 4 ; 0x4
+; CHECK: .byte 6 ; 0x6
+; CHECK: .byte 0 ; 0x0
+; CHECK: test1
+; CHECK: ldr d[[REG0:[0-9]+]], [{{.*}}, lCPI0_0
+; CHECK: movi.8h v[[REG1:[0-9]+]], #0x1, lsl #8
+; CHECK: tbl.8b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
+define <8 x i1> @test1() {
+entry:
+ %Shuff = shufflevector <8 x i1> <i1 0, i1 1, i1 2, i1 3, i1 4, i1 5, i1 6,
+ i1 7>,
+ <8 x i1> <i1 0, i1 1, i1 2, i1 3, i1 4, i1 5, i1 6,
+ i1 7>,
+ <8 x i32> <i32 2, i32 undef, i32 6, i32 undef, i32 10,
+ i32 12, i32 14, i32 0>
+ ret <8 x i1> %Shuff
+}
+
+; CHECK: lCPI1_0:
+; CHECK: .byte 2 ; 0x2
+; CHECK: .byte 255 ; 0xff
+; CHECK: .byte 6 ; 0x6
+; CHECK: .byte 255 ; 0xff
+; CHECK: .byte 10 ; 0xa
+; CHECK: .byte 12 ; 0xc
+; CHECK: .byte 14 ; 0xe
+; CHECK: .byte 0 ; 0x0
+; CHECK: test2
+; CHECK: ldr d[[REG0:[0-9]+]], [{{.*}}, lCPI1_0@PAGEOFF]
+; CHECK: adrp x[[REG2:[0-9]+]], lCPI1_1@PAGE
+; CHECK: ldr q[[REG1:[0-9]+]], [x[[REG2]], lCPI1_1@PAGEOFF]
+; CHECK: tbl.8b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
+define <8 x i1>@test2() {
+bb:
+ %Shuff = shufflevector <8 x i1> zeroinitializer,
+ <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0>,
+ <8 x i32> <i32 2, i32 undef, i32 6, i32 undef, i32 10, i32 12, i32 14,
+ i32 0>
+ ret <8 x i1> %Shuff
+}
+
+; CHECK: lCPI2_0:
+; CHECK: .byte 2 ; 0x2
+; CHECK: .byte 255 ; 0xff
+; CHECK: .byte 6 ; 0x6
+; CHECK: .byte 255 ; 0xff
+; CHECK: .byte 10 ; 0xa
+; CHECK: .byte 12 ; 0xc
+; CHECK: .byte 14 ; 0xe
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 2 ; 0x2
+; CHECK: .byte 255 ; 0xff
+; CHECK: .byte 6 ; 0x6
+; CHECK: .byte 255 ; 0xff
+; CHECK: .byte 10 ; 0xa
+; CHECK: .byte 12 ; 0xc
+; CHECK: .byte 14 ; 0xe
+; CHECK: .byte 0 ; 0x0
+; CHECK: test3
+; CHECK: adrp x[[REG3:[0-9]+]], lCPI2_0@PAGE
+; CHECK: ldr q[[REG0:[0-9]+]], [x[[REG3]], lCPI2_0@PAGEOFF]
+; CHECK: ldr q[[REG1:[0-9]+]], [x[[REG3]], lCPI2_1@PAGEOFF]
+; CHECK: tbl.16b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
+define <16 x i1> @test3(i1* %ptr, i32 %v) {
+bb:
+ %Shuff = shufflevector <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0>, <16 x i1> undef,
+ <16 x i32> <i32 2, i32 undef, i32 6, i32 undef, i32 10, i32 12, i32 14,
+ i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 10, i32 12,
+ i32 14, i32 0>
+ ret <16 x i1> %Shuff
+}
+; CHECK: lCPI3_1:
+; CHECK: .byte 2 ; 0x2
+; CHECK: .byte 1 ; 0x1
+; CHECK: .byte 6 ; 0x6
+; CHECK: .byte 18 ; 0x12
+; CHECK: .byte 10 ; 0xa
+; CHECK: .byte 12 ; 0xc
+; CHECK: .byte 14 ; 0xe
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 2 ; 0x2
+; CHECK: .byte 31 ; 0x1f
+; CHECK: .byte 6 ; 0x6
+; CHECK: .byte 30 ; 0x1e
+; CHECK: .byte 10 ; 0xa
+; CHECK: .byte 12 ; 0xc
+; CHECK: .byte 14 ; 0xe
+; CHECK: .byte 0 ; 0x0
+; CHECK: _test4:
+; CHECK: ldr q[[REG1:[0-9]+]]
+; CHECK: movi.2d v[[REG0:[0-9]+]], #0000000000000000
+; CHECK: adrp x[[REG3:[0-9]+]], lCPI3_1@PAGE
+; CHECK: ldr q[[REG2:[0-9]+]], [x[[REG3]], lCPI3_1@PAGEOFF]
+; CHECK: tbl.16b v{{[0-9]+}}, { v[[REG0]], v[[REG1]] }, v[[REG2]]
+define <16 x i1> @test4(i1* %ptr, i32 %v) {
+bb:
+ %Shuff = shufflevector <16 x i1> zeroinitializer,
+ <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1,
+ i1 1, i1 0, i1 0, i1 1, i1 0, i1 0>,
+ <16 x i32> <i32 2, i32 1, i32 6, i32 18, i32 10, i32 12, i32 14, i32 0,
+ i32 2, i32 31, i32 6, i32 30, i32 10, i32 12, i32 14, i32 0>
+ ret <16 x i1> %Shuff
+}
diff --git a/test/CodeGen/AArch64/arm64-vsqrt.ll b/test/CodeGen/AArch64/arm64-vsqrt.ll
new file mode 100644
index 0000000..02b7c7e
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vsqrt.ll
@@ -0,0 +1,232 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
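+; These tests check that the reciprocal and reciprocal-square-root estimate
+; and step intrinsics map directly to frecpe/frecps/frecpx, frsqrte/frsqrts,
+; urecpe and ursqrte.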
+define <2 x float> @frecps_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: frecps_2s:
+;CHECK: frecps.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frecps_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: frecps_4s:
+;CHECK: frecps.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frecps_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: frecps_2d:
+;CHECK: frecps.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.frecps.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.frecps.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+
+define <2 x float> @frsqrts_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: frsqrts_2s:
+;CHECK: frsqrts.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp2 = load <2 x float>* %B
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frsqrts_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: frsqrts_4s:
+;CHECK: frsqrts.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frsqrts_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: frsqrts_2d:
+;CHECK: frsqrts.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp2 = load <2 x double>* %B
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.frsqrts.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+ ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.frsqrts.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x float> @frecpe_2s(<2 x float>* %A) nounwind {
+;CHECK-LABEL: frecpe_2s:
+;CHECK: frecpe.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> %tmp1)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frecpe_4s(<4 x float>* %A) nounwind {
+;CHECK-LABEL: frecpe_4s:
+;CHECK: frecpe.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> %tmp1)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frecpe_2d(<2 x double>* %A) nounwind {
+;CHECK-LABEL: frecpe_2d:
+;CHECK: frecpe.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.frecpe.v2f64(<2 x double> %tmp1)
+ ret <2 x double> %tmp3
+}
+
+define float @frecpe_s(float* %A) nounwind {
+;CHECK-LABEL: frecpe_s:
+;CHECK: frecpe s0, {{s[0-9]+}}
+ %tmp1 = load float* %A
+ %tmp3 = call float @llvm.aarch64.neon.frecpe.f32(float %tmp1)
+ ret float %tmp3
+}
+
+define double @frecpe_d(double* %A) nounwind {
+;CHECK-LABEL: frecpe_d:
+;CHECK: frecpe d0, {{d[0-9]+}}
+ %tmp1 = load double* %A
+ %tmp3 = call double @llvm.aarch64.neon.frecpe.f64(double %tmp1)
+ ret double %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.frecpe.v2f64(<2 x double>) nounwind readnone
+declare float @llvm.aarch64.neon.frecpe.f32(float) nounwind readnone
+declare double @llvm.aarch64.neon.frecpe.f64(double) nounwind readnone
+
+define float @frecpx_s(float* %A) nounwind {
+;CHECK-LABEL: frecpx_s:
+;CHECK: frecpx s0, {{s[0-9]+}}
+ %tmp1 = load float* %A
+ %tmp3 = call float @llvm.aarch64.neon.frecpx.f32(float %tmp1)
+ ret float %tmp3
+}
+
+define double @frecpx_d(double* %A) nounwind {
+;CHECK-LABEL: frecpx_d:
+;CHECK: frecpx d0, {{d[0-9]+}}
+ %tmp1 = load double* %A
+ %tmp3 = call double @llvm.aarch64.neon.frecpx.f64(double %tmp1)
+ ret double %tmp3
+}
+
+declare float @llvm.aarch64.neon.frecpx.f32(float) nounwind readnone
+declare double @llvm.aarch64.neon.frecpx.f64(double) nounwind readnone
+
+define <2 x float> @frsqrte_2s(<2 x float>* %A) nounwind {
+;CHECK-LABEL: frsqrte_2s:
+;CHECK: frsqrte.2s
+ %tmp1 = load <2 x float>* %A
+ %tmp3 = call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %tmp1)
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @frsqrte_4s(<4 x float>* %A) nounwind {
+;CHECK-LABEL: frsqrte_4s:
+;CHECK: frsqrte.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp3 = call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %tmp1)
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @frsqrte_2d(<2 x double>* %A) nounwind {
+;CHECK-LABEL: frsqrte_2d:
+;CHECK: frsqrte.2d
+ %tmp1 = load <2 x double>* %A
+ %tmp3 = call <2 x double> @llvm.aarch64.neon.frsqrte.v2f64(<2 x double> %tmp1)
+ ret <2 x double> %tmp3
+}
+
+define float @frsqrte_s(float* %A) nounwind {
+;CHECK-LABEL: frsqrte_s:
+;CHECK: frsqrte s0, {{s[0-9]+}}
+ %tmp1 = load float* %A
+ %tmp3 = call float @llvm.aarch64.neon.frsqrte.f32(float %tmp1)
+ ret float %tmp3
+}
+
+define double @frsqrte_d(double* %A) nounwind {
+;CHECK-LABEL: frsqrte_d:
+;CHECK: frsqrte d0, {{d[0-9]+}}
+ %tmp1 = load double* %A
+ %tmp3 = call double @llvm.aarch64.neon.frsqrte.f64(double %tmp1)
+ ret double %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.frsqrte.v2f64(<2 x double>) nounwind readnone
+declare float @llvm.aarch64.neon.frsqrte.f32(float) nounwind readnone
+declare double @llvm.aarch64.neon.frsqrte.f64(double) nounwind readnone
+
+define <2 x i32> @urecpe_2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: urecpe_2s:
+;CHECK: urecpe.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @urecpe_4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: urecpe_4s:
+;CHECK: urecpe.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp3
+}
+
+declare <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32>) nounwind readnone
+
+define <2 x i32> @ursqrte_2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: ursqrte_2s:
+;CHECK: ursqrte.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32> %tmp1)
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @ursqrte_4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: ursqrte_4s:
+;CHECK: ursqrte.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp3 = call <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32> %tmp1)
+ ret <4 x i32> %tmp3
+}
+
+declare <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32>) nounwind readnone
+
+define float @f1(float %a, float %b) nounwind readnone optsize ssp {
+; CHECK-LABEL: f1:
+; CHECK: frsqrts s0, s0, s1
+; CHECK-NEXT: ret
+ %vrsqrtss.i = tail call float @llvm.aarch64.neon.frsqrts.f32(float %a, float %b) nounwind
+ ret float %vrsqrtss.i
+}
+
+define double @f2(double %a, double %b) nounwind readnone optsize ssp {
+; CHECK-LABEL: f2:
+; CHECK: frsqrts d0, d0, d1
+; CHECK-NEXT: ret
+ %vrsqrtsd.i = tail call double @llvm.aarch64.neon.frsqrts.f64(double %a, double %b) nounwind
+ ret double %vrsqrtsd.i
+}
+
+declare double @llvm.aarch64.neon.frsqrts.f64(double, double) nounwind readnone
+declare float @llvm.aarch64.neon.frsqrts.f32(float, float) nounwind readnone
diff --git a/test/CodeGen/AArch64/arm64-vsra.ll b/test/CodeGen/AArch64/arm64-vsra.ll
new file mode 100644
index 0000000..5e9cef3
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vsra.ll
@@ -0,0 +1,150 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
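+; Shift-right-and-accumulate: an ashr/lshr by a constant feeding an add should
+; be selected as ssra/usra.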
+define <8 x i8> @vsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vsras8:
+;CHECK: ssra.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = ashr <8 x i8> %tmp2, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
+ %tmp4 = add <8 x i8> %tmp1, %tmp3
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vsras16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: vsras16:
+;CHECK: ssra.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = ashr <4 x i16> %tmp2, < i16 15, i16 15, i16 15, i16 15 >
+ %tmp4 = add <4 x i16> %tmp1, %tmp3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vsras32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: vsras32:
+;CHECK: ssra.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = ashr <2 x i32> %tmp2, < i32 31, i32 31 >
+ %tmp4 = add <2 x i32> %tmp1, %tmp3
+ ret <2 x i32> %tmp4
+}
+
+define <16 x i8> @vsraQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vsraQs8:
+;CHECK: ssra.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = ashr <16 x i8> %tmp2, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
+ %tmp4 = add <16 x i8> %tmp1, %tmp3
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @vsraQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vsraQs16:
+;CHECK: ssra.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = ashr <8 x i16> %tmp2, < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >
+ %tmp4 = add <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vsraQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: vsraQs32:
+;CHECK: ssra.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = ashr <4 x i32> %tmp2, < i32 31, i32 31, i32 31, i32 31 >
+ %tmp4 = add <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vsraQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: vsraQs64:
+;CHECK: ssra.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = ashr <2 x i64> %tmp2, < i64 63, i64 63 >
+ %tmp4 = add <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @vsrau8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vsrau8:
+;CHECK: usra.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = lshr <8 x i8> %tmp2, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
+ %tmp4 = add <8 x i8> %tmp1, %tmp3
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @vsrau16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: vsrau16:
+;CHECK: usra.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = lshr <4 x i16> %tmp2, < i16 15, i16 15, i16 15, i16 15 >
+ %tmp4 = add <4 x i16> %tmp1, %tmp3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @vsrau32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: vsrau32:
+;CHECK: usra.2s
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = lshr <2 x i32> %tmp2, < i32 31, i32 31 >
+ %tmp4 = add <2 x i32> %tmp1, %tmp3
+ ret <2 x i32> %tmp4
+}
+
+
+define <16 x i8> @vsraQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vsraQu8:
+;CHECK: usra.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = lshr <16 x i8> %tmp2, < i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7 >
+ %tmp4 = add <16 x i8> %tmp1, %tmp3
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @vsraQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vsraQu16:
+;CHECK: usra.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = lshr <8 x i16> %tmp2, < i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15 >
+ %tmp4 = add <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @vsraQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: vsraQu32:
+;CHECK: usra.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = lshr <4 x i32> %tmp2, < i32 31, i32 31, i32 31, i32 31 >
+ %tmp4 = add <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @vsraQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: vsraQu64:
+;CHECK: usra.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = lshr <2 x i64> %tmp2, < i64 63, i64 63 >
+ %tmp4 = add <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+define <1 x i64> @vsra_v1i64(<1 x i64> %A, <1 x i64> %B) nounwind {
+; CHECK-LABEL: vsra_v1i64:
+; CHECK: ssra d0, d1, #63
+ %tmp3 = ashr <1 x i64> %B, < i64 63 >
+ %tmp4 = add <1 x i64> %A, %tmp3
+ ret <1 x i64> %tmp4
+}
diff --git a/test/CodeGen/AArch64/arm64-vsub.ll b/test/CodeGen/AArch64/arm64-vsub.ll
new file mode 100644
index 0000000..c2c8755
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-vsub.ll
@@ -0,0 +1,417 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
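+; Narrowing, long and wide subtract patterns: the subhn/rsubhn intrinsics and
+; sext/zext-based sub patterns should select ssubl/usubl, ssubw/usubw and the
+; corresponding "2" variants for high halves.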
+define <8 x i8> @subhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: subhn8b:
+;CHECK: subhn.8b
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @subhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: subhn4h:
+;CHECK: subhn.4h
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @subhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: subhn2s:
+;CHECK: subhn.2s
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @subhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
+;CHECK-LABEL: subhn2_16b:
+;CHECK: subhn.8b
+;CHECK-NEXT: subhn2.16b
+ %vsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %vsubhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %res = shufflevector <8 x i8> %vsubhn2.i, <8 x i8> %vsubhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @subhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
+;CHECK-LABEL: subhn2_8h:
+;CHECK: subhn.4h
+;CHECK-NEXT: subhn2.8h
+ %vsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %vsubhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %res = shufflevector <4 x i16> %vsubhn2.i, <4 x i16> %vsubhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @subhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
+;CHECK-LABEL: subhn2_4s:
+;CHECK: subhn.2s
+;CHECK-NEXT: subhn2.4s
+ %vsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %vsubhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %res = shufflevector <2 x i32> %vsubhn2.i, <2 x i32> %vsubhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+declare <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i8> @rsubhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: rsubhn8b:
+;CHECK: rsubhn.8b
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @rsubhn4h(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: rsubhn4h:
+;CHECK: rsubhn.4h
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @rsubhn2s(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: rsubhn2s:
+;CHECK: rsubhn.2s
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i64>* %B
+ %tmp3 = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @rsubhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
+;CHECK-LABEL: rsubhn2_16b:
+;CHECK: rsubhn.8b
+;CHECK-NEXT: rsubhn2.16b
+ %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %vrsubhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+ %res = shufflevector <8 x i8> %vrsubhn2.i, <8 x i8> %vrsubhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %res
+}
+
+define <8 x i16> @rsubhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
+;CHECK-LABEL: rsubhn2_8h:
+;CHECK: rsubhn.4h
+;CHECK-NEXT: rsubhn2.8h
+ %vrsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %vrsubhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+ %res = shufflevector <4 x i16> %vrsubhn2.i, <4 x i16> %vrsubhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @rsubhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
+;CHECK-LABEL: rsubhn2_4s:
+;CHECK: rsubhn.2s
+;CHECK-NEXT: rsubhn2.4s
+ %vrsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %vrsubhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+ %res = shufflevector <2 x i32> %vrsubhn2.i, <2 x i32> %vrsubhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res
+}
+
+declare <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i16> @ssubl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: ssubl8h:
+;CHECK: ssubl.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
+ %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
+ %tmp5 = sub <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @ssubl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: ssubl4s:
+;CHECK: ssubl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
+ %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
+ %tmp5 = sub <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @ssubl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: ssubl2d:
+;CHECK: ssubl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
+ %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
+ %tmp5 = sub <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @ssubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: ssubl2_8h:
+;CHECK: ssubl2.8h
+ %tmp1 = load <16 x i8>* %A
+ %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %ext1 = sext <8 x i8> %high1 to <8 x i16>
+
+ %tmp2 = load <16 x i8>* %B
+ %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %ext2 = sext <8 x i8> %high2 to <8 x i16>
+
+ %res = sub <8 x i16> %ext1, %ext2
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @ssubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: ssubl2_4s:
+;CHECK: ssubl2.4s
+ %tmp1 = load <8 x i16>* %A
+ %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %ext1 = sext <4 x i16> %high1 to <4 x i32>
+
+ %tmp2 = load <8 x i16>* %B
+ %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %ext2 = sext <4 x i16> %high2 to <4 x i32>
+
+ %res = sub <4 x i32> %ext1, %ext2
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @ssubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: ssubl2_2d:
+;CHECK: ssubl2.2d
+ %tmp1 = load <4 x i32>* %A
+ %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %ext1 = sext <2 x i32> %high1 to <2 x i64>
+
+ %tmp2 = load <4 x i32>* %B
+ %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %ext2 = sext <2 x i32> %high2 to <2 x i64>
+
+ %res = sub <2 x i64> %ext1, %ext2
+ ret <2 x i64> %res
+}
+
+define <8 x i16> @usubl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: usubl8h:
+;CHECK: usubl.8h
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
+ %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
+ %tmp5 = sub <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @usubl4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: usubl4s:
+;CHECK: usubl.4s
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
+ %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
+ %tmp5 = sub <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @usubl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: usubl2d:
+;CHECK: usubl.2d
+ %tmp1 = load <2 x i32>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
+ %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
+ %tmp5 = sub <2 x i64> %tmp3, %tmp4
+ ret <2 x i64> %tmp5
+}
+
+define <8 x i16> @usubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: usubl2_8h:
+;CHECK: usubl2.8h
+ %tmp1 = load <16 x i8>* %A
+ %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %ext1 = zext <8 x i8> %high1 to <8 x i16>
+
+ %tmp2 = load <16 x i8>* %B
+ %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %ext2 = zext <8 x i8> %high2 to <8 x i16>
+
+ %res = sub <8 x i16> %ext1, %ext2
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @usubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: usubl2_4s:
+;CHECK: usubl2.4s
+ %tmp1 = load <8 x i16>* %A
+ %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %ext1 = zext <4 x i16> %high1 to <4 x i32>
+
+ %tmp2 = load <8 x i16>* %B
+ %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %ext2 = zext <4 x i16> %high2 to <4 x i32>
+
+ %res = sub <4 x i32> %ext1, %ext2
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @usubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: usubl2_2d:
+;CHECK: usubl2.2d
+ %tmp1 = load <4 x i32>* %A
+ %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %ext1 = zext <2 x i32> %high1 to <2 x i64>
+
+ %tmp2 = load <4 x i32>* %B
+ %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %ext2 = zext <2 x i32> %high2 to <2 x i64>
+
+ %res = sub <2 x i64> %ext1, %ext2
+ ret <2 x i64> %res
+}
+
+define <8 x i16> @ssubw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: ssubw8h:
+;CHECK: ssubw.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = sext <8 x i8> %tmp2 to <8 x i16>
+ %tmp4 = sub <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @ssubw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: ssubw4s:
+;CHECK: ssubw.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = sext <4 x i16> %tmp2 to <4 x i32>
+ %tmp4 = sub <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @ssubw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: ssubw2d:
+;CHECK: ssubw.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = sext <2 x i32> %tmp2 to <2 x i64>
+ %tmp4 = sub <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @ssubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: ssubw2_8h:
+;CHECK: ssubw2.8h
+ %tmp1 = load <8 x i16>* %A
+
+ %tmp2 = load <16 x i8>* %B
+ %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %ext2 = sext <8 x i8> %high2 to <8 x i16>
+
+ %res = sub <8 x i16> %tmp1, %ext2
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @ssubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: ssubw2_4s:
+;CHECK: ssubw2.4s
+ %tmp1 = load <4 x i32>* %A
+
+ %tmp2 = load <8 x i16>* %B
+ %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %ext2 = sext <4 x i16> %high2 to <4 x i32>
+
+ %res = sub <4 x i32> %tmp1, %ext2
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @ssubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: ssubw2_2d:
+;CHECK: ssubw2.2d
+ %tmp1 = load <2 x i64>* %A
+
+ %tmp2 = load <4 x i32>* %B
+ %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %ext2 = sext <2 x i32> %high2 to <2 x i64>
+
+ %res = sub <2 x i64> %tmp1, %ext2
+ ret <2 x i64> %res
+}
+
+define <8 x i16> @usubw8h(<8 x i16>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: usubw8h:
+;CHECK: usubw.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = zext <8 x i8> %tmp2 to <8 x i16>
+ %tmp4 = sub <8 x i16> %tmp1, %tmp3
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @usubw4s(<4 x i32>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: usubw4s:
+;CHECK: usubw.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = zext <4 x i16> %tmp2 to <4 x i32>
+ %tmp4 = sub <4 x i32> %tmp1, %tmp3
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @usubw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: usubw2d:
+;CHECK: usubw.2d
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = zext <2 x i32> %tmp2 to <2 x i64>
+ %tmp4 = sub <2 x i64> %tmp1, %tmp3
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i16> @usubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: usubw2_8h:
+;CHECK: usubw2.8h
+ %tmp1 = load <8 x i16>* %A
+
+ %tmp2 = load <16 x i8>* %B
+ %high2 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %ext2 = zext <8 x i8> %high2 to <8 x i16>
+
+ %res = sub <8 x i16> %tmp1, %ext2
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @usubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: usubw2_4s:
+;CHECK: usubw2.4s
+ %tmp1 = load <4 x i32>* %A
+
+ %tmp2 = load <8 x i16>* %B
+ %high2 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %ext2 = zext <4 x i16> %high2 to <4 x i32>
+
+ %res = sub <4 x i32> %tmp1, %ext2
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @usubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: usubw2_2d:
+;CHECK: usubw2.2d
+ %tmp1 = load <2 x i64>* %A
+
+ %tmp2 = load <4 x i32>* %B
+ %high2 = shufflevector <4 x i32> %tmp2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %ext2 = zext <2 x i32> %high2 to <2 x i64>
+
+ %res = sub <2 x i64> %tmp1, %ext2
+ ret <2 x i64> %res
+}
diff --git a/test/CodeGen/AArch64/arm64-weak-reference.ll b/test/CodeGen/AArch64/arm64-weak-reference.ll
new file mode 100644
index 0000000..b2135e0
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-weak-reference.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
+
+@x = extern_weak global i32
+
+define i32 @fn() nounwind ssp {
+; CHECK-LABEL: fn:
+; CHECK: .weak_reference
+ %val = load i32* @x, align 4
+ ret i32 %val
+}
diff --git a/test/CodeGen/AArch64/arm64-xaluo.ll b/test/CodeGen/AArch64/arm64-xaluo.ll
new file mode 100644
index 0000000..6cffbde
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-xaluo.ll
@@ -0,0 +1,524 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+;
+; Get the actual value of the overflow bit.
+;
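+; Each test calls an llvm.*.with.overflow intrinsic, stores the result value
+; and returns the i1 overflow flag, so the flag has to be materialized
+; explicitly; the CHECK lines expect an adds/subs (or a widening multiply and
+; compare) feeding a cset on the relevant condition.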
+define i1 @saddo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; CHECK-LABEL: saddo.i32
+; CHECK: adds w8, w0, w1
+; CHECK-NEXT: cset w0, vs
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define i1 @saddo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; CHECK-LABEL: saddo.i64
+; CHECK: adds x8, x0, x1
+; CHECK-NEXT: cset w0, vs
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+define i1 @uaddo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; CHECK-LABEL: uaddo.i32
+; CHECK: adds w8, w0, w1
+; CHECK-NEXT: cset w0, hs
+ %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define i1 @uaddo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; CHECK-LABEL: uaddo.i64
+; CHECK: adds x8, x0, x1
+; CHECK-NEXT: cset w0, hs
+ %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+define i1 @ssubo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; CHECK-LABEL: ssubo.i32
+; CHECK: subs w8, w0, w1
+; CHECK-NEXT: cset w0, vs
+ %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define i1 @ssubo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; CHECK-LABEL: ssubo.i64
+; CHECK: subs x8, x0, x1
+; CHECK-NEXT: cset w0, vs
+ %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+define i1 @usubo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; CHECK-LABEL: usubo.i32
+; CHECK: subs w8, w0, w1
+; CHECK-NEXT: cset w0, lo
+ %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define i1 @usubo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; CHECK-LABEL: usubo.i64
+; CHECK: subs x8, x0, x1
+; CHECK-NEXT: cset w0, lo
+ %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+define i1 @smulo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; CHECK-LABEL: smulo.i32
+; CHECK: smull x8, w0, w1
+; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: cmp w9, w8, asr #31
+; CHECK-NEXT: cset w0, ne
+ %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; CHECK-LABEL: smulo.i64
+; CHECK: mul x8, x0, x1
+; CHECK-NEXT: smulh x9, x0, x1
+; CHECK-NEXT: cmp x9, x8, asr #63
+; CHECK-NEXT: cset w0, ne
+ %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+define i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) {
+entry:
+; CHECK-LABEL: umulo.i32
+; CHECK: umull x8, w0, w1
+; CHECK-NEXT: cmp xzr, x8, lsr #32
+; CHECK-NEXT: cset w0, ne
+ %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define i1 @umulo.i64(i64 %v1, i64 %v2, i64* %res) {
+entry:
+; CHECK-LABEL: umulo.i64
+; CHECK: umulh x8, x0, x1
+; CHECK-NEXT: cmp xzr, x8
+; CHECK-NEXT: cset w8, ne
+; CHECK-NEXT: mul x9, x0, x1
+ %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
+
+;
+; Check the use of the overflow bit in combination with a select instruction.
+;
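+; Here only the flag is used, so the add/sub can be folded into a flag-setting
+; compare (cmn/cmp) and a csel picks between the operands on the overflow
+; condition; the multiply cases still go through the widening multiply sequence.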
+define i32 @saddo.select.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: saddo.select.i32
+; CHECK: cmn w0, w1
+; CHECK-NEXT: csel w0, w0, w1, vs
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = select i1 %obit, i32 %v1, i32 %v2
+ ret i32 %ret
+}
+
+define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: saddo.select.i64
+; CHECK: cmn x0, x1
+; CHECK-NEXT: csel x0, x0, x1, vs
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = select i1 %obit, i64 %v1, i64 %v2
+ ret i64 %ret
+}
+
+define i32 @uaddo.select.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: uaddo.select.i32
+; CHECK: cmn w0, w1
+; CHECK-NEXT: csel w0, w0, w1, hs
+ %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = select i1 %obit, i32 %v1, i32 %v2
+ ret i32 %ret
+}
+
+define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: uaddo.select.i64
+; CHECK: cmn x0, x1
+; CHECK-NEXT: csel x0, x0, x1, hs
+ %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = select i1 %obit, i64 %v1, i64 %v2
+ ret i64 %ret
+}
+
+define i32 @ssubo.select.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: ssubo.select.i32
+; CHECK: cmp w0, w1
+; CHECK-NEXT: csel w0, w0, w1, vs
+ %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = select i1 %obit, i32 %v1, i32 %v2
+ ret i32 %ret
+}
+
+define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: ssubo.select.i64
+; CHECK: cmp x0, x1
+; CHECK-NEXT: csel x0, x0, x1, vs
+ %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = select i1 %obit, i64 %v1, i64 %v2
+ ret i64 %ret
+}
+
+define i32 @usubo.select.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: usubo.select.i32
+; CHECK: cmp w0, w1
+; CHECK-NEXT: csel w0, w0, w1, lo
+ %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = select i1 %obit, i32 %v1, i32 %v2
+ ret i32 %ret
+}
+
+define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: usubo.select.i64
+; CHECK: cmp x0, x1
+; CHECK-NEXT: csel x0, x0, x1, lo
+ %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = select i1 %obit, i64 %v1, i64 %v2
+ ret i64 %ret
+}
+
+define i32 @smulo.select.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: smulo.select.i32
+; CHECK: smull x8, w0, w1
+; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: cmp w9, w8, asr #31
+; CHECK-NEXT: csel w0, w0, w1, ne
+ %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = select i1 %obit, i32 %v1, i32 %v2
+ ret i32 %ret
+}
+
+define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: smulo.select.i64
+; CHECK: mul x8, x0, x1
+; CHECK-NEXT: smulh x9, x0, x1
+; CHECK-NEXT: cmp x9, x8, asr #63
+; CHECK-NEXT: csel x0, x0, x1, ne
+ %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = select i1 %obit, i64 %v1, i64 %v2
+ ret i64 %ret
+}
+
+define i32 @umulo.select.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: umulo.select.i32
+; CHECK: umull x8, w0, w1
+; CHECK-NEXT: cmp xzr, x8, lsr #32
+; CHECK-NEXT: csel w0, w0, w1, ne
+ %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = select i1 %obit, i32 %v1, i32 %v2
+ ret i32 %ret
+}
+
+define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: umulo.select.i64
+; CHECK: umulh x8, x0, x1
+; CHECK-NEXT: cmp xzr, x8
+; CHECK-NEXT: csel x0, x0, x1, ne
+ %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = select i1 %obit, i64 %v1, i64 %v2
+ ret i64 %ret
+}
+
+
+;
+; Check the use of the overflow bit in combination with a branch instruction.
+;
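+; Same idea with a conditional branch: the expected code compares (cmn/cmp or
+; the multiply sequence) and branches directly on the inverted condition
+; (b.vc, b.lo, b.hs, b.eq, or a cbz), without materializing the flag first.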
+define i1 @saddo.br.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: saddo.br.i32
+; CHECK: cmn w0, w1
+; CHECK-NEXT: b.vc
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @saddo.br.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: saddo.br.i64
+; CHECK: cmn x0, x1
+; CHECK-NEXT: b.vc
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: uaddo.br.i32
+; CHECK: cmn w0, w1
+; CHECK-NEXT: b.lo
+ %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: uaddo.br.i64
+; CHECK: cmn x0, x1
+; CHECK-NEXT: b.lo
+ %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @ssubo.br.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: ssubo.br.i32
+; CHECK: cmp w0, w1
+; CHECK-NEXT: b.vc
+ %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: ssubo.br.i64
+; CHECK: cmp x0, x1
+; CHECK-NEXT: b.vc
+ %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @usubo.br.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: usubo.br.i32
+; CHECK: cmp w0, w1
+; CHECK-NEXT: b.hs
+ %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @usubo.br.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: usubo.br.i64
+; CHECK: cmp x0, x1
+; CHECK-NEXT: b.hs
+ %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @smulo.br.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: smulo.br.i32
+; CHECK: smull x8, w0, w1
+; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: cmp w9, w8, asr #31
+; CHECK-NEXT: b.eq
+ %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @smulo.br.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: smulo.br.i64
+; CHECK: mul x8, x0, x1
+; CHECK-NEXT: smulh x9, x0, x1
+; CHECK-NEXT: cmp x9, x8, asr #63
+; CHECK-NEXT: b.eq
+ %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @umulo.br.i32(i32 %v1, i32 %v2) {
+entry:
+; CHECK-LABEL: umulo.br.i32
+; CHECK: umull x8, w0, w1
+; CHECK-NEXT: cmp xzr, x8, lsr #32
+; CHECK-NEXT: b.eq
+ %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+define i1 @umulo.br.i64(i64 %v1, i64 %v2) {
+entry:
+; CHECK-LABEL: umulo.br.i64
+; CHECK: umulh x8, x0, x1
+; CHECK-NEXT: cbz
+ %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ br i1 %obit, label %overflow, label %continue
+
+overflow:
+ ret i1 false
+
+continue:
+ ret i1 true
+}
+
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
+
diff --git a/test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll b/test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll
new file mode 100644
index 0000000..c56d607
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=arm64-apple-ios -mcpu=cyclone < %s | FileCheck %s
+; rdar://12254953
+
+define i32 @t(i32 %a, i32 %b, i32 %c, i32 %d) nounwind ssp {
+entry:
+; CHECK-LABEL: t:
+; CHECK: mov x0, [[REG1:x[0-9]+]]
+; CHECK: mov x1, [[REG2:x[0-9]+]]
+; CHECK: bl _foo
+; CHECK: mov x0, [[REG1]]
+; CHECK: mov x1, [[REG2]]
+ %call = call i32 @foo(i32 %c, i32 %d) nounwind
+ %call1 = call i32 @foo(i32 %c, i32 %d) nounwind
+ unreachable
+}
+
+declare i32 @foo(i32, i32)
diff --git a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
new file mode 100644
index 0000000..349bb6f
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
@@ -0,0 +1,49 @@
+; RUN: llc -mtriple=arm64-apple-ios -mcpu=cyclone < %s | FileCheck %s
+; rdar://11481771
+; rdar://13713797
+
+define void @t1() nounwind ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK-NOT: fmov
+; CHECK: movi.2d v0, #0000000000000000
+; CHECK: movi.2d v1, #0000000000000000
+; CHECK: movi.2d v2, #0000000000000000
+; CHECK: movi.2d v3, #0000000000000000
+ tail call void @bar(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00) nounwind
+ ret void
+}
+
+define void @t2() nounwind ssp {
+entry:
+; CHECK-LABEL: t2:
+; CHECK-NOT: mov w0, wzr
+; CHECK: movz w0, #0
+; CHECK: movz w1, #0
+ tail call void @bari(i32 0, i32 0) nounwind
+ ret void
+}
+
+define void @t3() nounwind ssp {
+entry:
+; CHECK-LABEL: t3:
+; CHECK-NOT: mov x0, xzr
+; CHECK: movz x0, #0
+; CHECK: movz x1, #0
+ tail call void @barl(i64 0, i64 0) nounwind
+ ret void
+}
+
+define void @t4() nounwind ssp {
+; CHECK-LABEL: t4:
+; CHECK-NOT: fmov
+; CHECK: movi.2d v0, #0000000000000000
+; CHECK: movi.2d v1, #0000000000000000
+ tail call void @barf(float 0.000000e+00, float 0.000000e+00) nounwind
+ ret void
+}
+
+declare void @bar(double, double, double, double)
+declare void @bari(i32, i32)
+declare void @barl(i64, i64)
+declare void @barf(float, float)
diff --git a/test/CodeGen/AArch64/arm64-zext.ll b/test/CodeGen/AArch64/arm64-zext.ll
new file mode 100644
index 0000000..8d9e5ea
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-zext.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define i64 @foo(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+; CHECK-LABEL: foo:
+; CHECK: add w0, w1, w0
+; CHECK: ret
+ %add = add i32 %b, %a
+ %conv = zext i32 %add to i64
+ ret i64 %conv
+}
diff --git a/test/CodeGen/AArch64/arm64-zextload-unscaled.ll b/test/CodeGen/AArch64/arm64-zextload-unscaled.ll
new file mode 100644
index 0000000..c475dbd
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-zextload-unscaled.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=arm64 < %s | FileCheck %s
+
+@var32 = global i32 0
+
+define void @test_zextloadi1_unscaled(i1* %base) {
+; CHECK-LABEL: test_zextloadi1_unscaled:
+; CHECK: ldurb {{w[0-9]+}}, [{{x[0-9]+}}, #-7]
+
+ %addr = getelementptr i1* %base, i32 -7
+ %val = load i1* %addr, align 1
+
+ %extended = zext i1 %val to i32
+ store i32 %extended, i32* @var32, align 4
+ ret void
+}
+
+define void @test_zextloadi8_unscaled(i8* %base) {
+; CHECK-LABEL: test_zextloadi8_unscaled:
+; CHECK: ldurb {{w[0-9]+}}, [{{x[0-9]+}}, #-7]
+
+ %addr = getelementptr i8* %base, i32 -7
+ %val = load i8* %addr, align 1
+
+ %extended = zext i8 %val to i32
+ store i32 %extended, i32* @var32, align 4
+ ret void
+}
+
+define void @test_zextloadi16_unscaled(i16* %base) {
+; CHECK-LABEL: test_zextloadi16_unscaled:
+; CHECK: ldurh {{w[0-9]+}}, [{{x[0-9]+}}, #-14]
+
+ %addr = getelementptr i16* %base, i32 -7
+ %val = load i16* %addr, align 2
+
+ %extended = zext i16 %val to i32
+ store i32 %extended, i32* @var32, align 4
+ ret void
+}
+
diff --git a/test/CodeGen/AArch64/arm64-zip.ll b/test/CodeGen/AArch64/arm64-zip.ll
new file mode 100644
index 0000000..304b280
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-zip.ll
@@ -0,0 +1,107 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @vzipi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vzipi8:
+;CHECK: zip1.8b
+;CHECK: zip2.8b
+;CHECK-NEXT: add.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <4 x i16> @vzipi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: vzipi16:
+;CHECK: zip1.4h
+;CHECK: zip2.4h
+;CHECK-NEXT: add.4h
+ %tmp1 = load <4 x i16>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %tmp5 = add <4 x i16> %tmp3, %tmp4
+ ret <4 x i16> %tmp5
+}
+
+define <16 x i8> @vzipQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vzipQi8:
+;CHECK: zip1.16b
+;CHECK: zip2.16b
+;CHECK-NEXT: add.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
+
+define <8 x i16> @vzipQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: vzipQi16:
+;CHECK: zip1.8h
+;CHECK: zip2.8h
+;CHECK-NEXT: add.8h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ %tmp5 = add <8 x i16> %tmp3, %tmp4
+ ret <8 x i16> %tmp5
+}
+
+define <4 x i32> @vzipQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: vzipQi32:
+;CHECK: zip1.4s
+;CHECK: zip2.4s
+;CHECK-NEXT: add.4s
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i32>* %B
+ %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %tmp5 = add <4 x i32> %tmp3, %tmp4
+ ret <4 x i32> %tmp5
+}
+
+define <4 x float> @vzipQf(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: vzipQf:
+;CHECK: zip1.4s
+;CHECK: zip2.4s
+;CHECK-NEXT: fadd.4s
+ %tmp1 = load <4 x float>* %A
+ %tmp2 = load <4 x float>* %B
+ %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %tmp5 = fadd <4 x float> %tmp3, %tmp4
+ ret <4 x float> %tmp5
+}
+
+; Undef shuffle indices should not prevent matching to VZIP:
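+; The shuffle masks below have some undef lanes; zip1/zip2 should still be
+; selected as long as the defined lanes follow the interleaving pattern.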
+
+define <8 x i8> @vzipi8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: vzipi8_undef:
+;CHECK: zip1.8b
+;CHECK: zip2.8b
+;CHECK-NEXT: add.8b
+ %tmp1 = load <8 x i8>* %A
+ %tmp2 = load <8 x i8>* %B
+ %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 undef, i32 1, i32 9, i32 undef, i32 10, i32 3, i32 11>
+ %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 undef, i32 undef, i32 15>
+ %tmp5 = add <8 x i8> %tmp3, %tmp4
+ ret <8 x i8> %tmp5
+}
+
+define <16 x i8> @vzipQi8_undef(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: vzipQi8_undef:
+;CHECK: zip1.16b
+;CHECK: zip2.16b
+;CHECK-NEXT: add.16b
+ %tmp1 = load <16 x i8>* %A
+ %tmp2 = load <16 x i8>* %B
+ %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 1, i32 undef, i32 undef, i32 undef, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 8, i32 24, i32 9, i32 undef, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 undef, i32 14, i32 30, i32 undef, i32 31>
+ %tmp5 = add <16 x i8> %tmp3, %tmp4
+ ret <16 x i8> %tmp5
+}
diff --git a/test/CodeGen/AArch64/asm-large-immediate.ll b/test/CodeGen/AArch64/asm-large-immediate.ll
new file mode 100644
index 0000000..05e4ddd
--- /dev/null
+++ b/test/CodeGen/AArch64/asm-large-immediate.ll
@@ -0,0 +1,10 @@
+; RUN: llc -march=aarch64 -no-integrated-as < %s | FileCheck %s
+
+define void @test() {
+entry:
+; CHECK: /* result: 68719476738 */
+ tail call void asm sideeffect "/* result: ${0:c} */", "i,~{dirflag},~{fpsr},~{flags}"( i64 68719476738 )
+; CHECK: /* result: -68719476738 */
+ tail call void asm sideeffect "/* result: ${0:n} */", "i,~{dirflag},~{fpsr},~{flags}"( i64 68719476738 )
+ ret void
+}
diff --git a/test/CodeGen/AArch64/assertion-rc-mismatch.ll b/test/CodeGen/AArch64/assertion-rc-mismatch.ll
index 02b0c0e..bcf206e 100644
--- a/test/CodeGen/AArch64/assertion-rc-mismatch.ll
+++ b/test/CodeGen/AArch64/assertion-rc-mismatch.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
; Test case related to <rdar://problem/15633429>.
; CHECK-LABEL: small
diff --git a/test/CodeGen/AArch64/atomic-ops.ll b/test/CodeGen/AArch64/atomic-ops.ll
index 5fe2936..58b5d1d 100644
--- a/test/CodeGen/AArch64/atomic-ops.ll
+++ b/test/CodeGen/AArch64/atomic-ops.ll
@@ -1,5 +1,11 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-REG %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-REG
+
+
+; The point of CHECK-REG is to make sure UNPREDICTABLE instructions aren't created
+; (i.e. instructions that reuse a register for both status and data in a store exclusive).
+; CHECK-REG-NOT: stlxrb w[[NEW:[0-9]+]], w[[NEW]], [x{{[0-9]+}}]
+; CHECK-REG-NOT: stlxrb w[[NEW:[0-9]+]], x[[NEW]], [x{{[0-9]+}}]
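+; The patterns bind [[NEW]] at its first use and then require the same
+; register as the data operand, so they only match the bad case where the
+; status and data registers are identical.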
@var8 = global i8 0
@var16 = global i16 0
@@ -11,20 +17,18 @@ define i8 @test_atomic_load_add_i8(i8 %offset) nounwind {
%old = atomicrmw add i8* @var8, i8 %offset seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: add [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: add w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stlxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -33,20 +37,18 @@ define i16 @test_atomic_load_add_i16(i16 %offset) nounwind {
%old = atomicrmw add i16* @var16, i16 %offset acquire
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: add [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: add w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -55,20 +57,18 @@ define i32 @test_atomic_load_add_i32(i32 %offset) nounwind {
%old = atomicrmw add i32* @var32, i32 %offset release
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: add [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: add w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
@@ -77,15 +77,13 @@ define i64 @test_atomic_load_add_i64(i64 %offset) nounwind {
%old = atomicrmw add i64* @var64, i64 %offset monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: add [[NEW:x[0-9]+]], x[[OLD]], x0
-; CHECK-REG: add x[[NEW:[0-9]+]], x{{[0-9]+}}, x0
-; CHECK-REG-NOT: stxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -99,20 +97,18 @@ define i8 @test_atomic_load_sub_i8(i8 %offset) nounwind {
%old = atomicrmw sub i8* @var8, i8 %offset monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: sub [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: sub w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -121,20 +117,18 @@ define i16 @test_atomic_load_sub_i16(i16 %offset) nounwind {
%old = atomicrmw sub i16* @var16, i16 %offset release
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: sub [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: sub w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stlxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -143,20 +137,18 @@ define i32 @test_atomic_load_sub_i32(i32 %offset) nounwind {
%old = atomicrmw sub i32* @var32, i32 %offset acquire
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: sub [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: sub w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
@@ -165,15 +157,13 @@ define i64 @test_atomic_load_sub_i64(i64 %offset) nounwind {
%old = atomicrmw sub i64* @var64, i64 %offset seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: sub [[NEW:x[0-9]+]], x[[OLD]], x0
-; CHECK-REG: sub x[[NEW:[0-9]+]], x{{[0-9]+}}, x0
-; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -187,20 +177,18 @@ define i8 @test_atomic_load_and_i8(i8 %offset) nounwind {
%old = atomicrmw and i8* @var8, i8 %offset release
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: and [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: and w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stlxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -209,20 +197,18 @@ define i16 @test_atomic_load_and_i16(i16 %offset) nounwind {
%old = atomicrmw and i16* @var16, i16 %offset monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: and [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: and w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -231,20 +217,18 @@ define i32 @test_atomic_load_and_i32(i32 %offset) nounwind {
%old = atomicrmw and i32* @var32, i32 %offset seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: and [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: and w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
@@ -253,15 +237,13 @@ define i64 @test_atomic_load_and_i64(i64 %offset) nounwind {
%old = atomicrmw and i64* @var64, i64 %offset acquire
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: and [[NEW:x[0-9]+]], x[[OLD]], x0
-; CHECK-REG: and x[[NEW:[0-9]+]], x{{[0-9]+}}, x0
-; CHECK-REG-NOT: stxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -275,20 +257,18 @@ define i8 @test_atomic_load_or_i8(i8 %offset) nounwind {
%old = atomicrmw or i8* @var8, i8 %offset seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: orr [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: orr w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stlxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -297,20 +277,18 @@ define i16 @test_atomic_load_or_i16(i16 %offset) nounwind {
%old = atomicrmw or i16* @var16, i16 %offset monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: orr [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: orr w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -319,20 +297,18 @@ define i32 @test_atomic_load_or_i32(i32 %offset) nounwind {
%old = atomicrmw or i32* @var32, i32 %offset acquire
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: orr [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: orr w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
@@ -341,15 +317,13 @@ define i64 @test_atomic_load_or_i64(i64 %offset) nounwind {
%old = atomicrmw or i64* @var64, i64 %offset release
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: orr [[NEW:x[0-9]+]], x[[OLD]], x0
-; CHECK-REG: orr x[[NEW:[0-9]+]], x{{[0-9]+}}, x0
-; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -363,20 +337,18 @@ define i8 @test_atomic_load_xor_i8(i8 %offset) nounwind {
%old = atomicrmw xor i8* @var8, i8 %offset acquire
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: eor [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: eor w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -385,20 +357,18 @@ define i16 @test_atomic_load_xor_i16(i16 %offset) nounwind {
%old = atomicrmw xor i16* @var16, i16 %offset release
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: eor [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: eor w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -407,20 +377,18 @@ define i32 @test_atomic_load_xor_i32(i32 %offset) nounwind {
%old = atomicrmw xor i32* @var32, i32 %offset seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: eor [[NEW:w[0-9]+]], w[[OLD]], w0
-; CHECK-REG: eor w[[NEW:[0-9]+]], w{{[0-9]+}}, w0
-; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
@@ -429,15 +397,13 @@ define i64 @test_atomic_load_xor_i64(i64 %offset) nounwind {
%old = atomicrmw xor i64* @var64, i64 %offset monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: eor [[NEW:x[0-9]+]], x[[OLD]], x0
-; CHECK-REG: eor x[[NEW:[0-9]+]], x{{[0-9]+}}, x0
-; CHECK-REG-NOT: stxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -451,18 +417,17 @@ define i8 @test_atomic_load_xchg_i8(i8 %offset) nounwind {
%old = atomicrmw xchg i8* @var8, i8 %offset monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-REG-NOT: stxrb w0, w0, [x{{[0-9]+}}]
; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], w0, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -471,18 +436,17 @@ define i16 @test_atomic_load_xchg_i16(i16 %offset) nounwind {
%old = atomicrmw xchg i16* @var16, i16 %offset seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-REG-NOT: stlxrh w0, w0, [x{{[0-9]+}}]
; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], w0, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -491,18 +455,17 @@ define i32 @test_atomic_load_xchg_i32(i32 %offset) nounwind {
%old = atomicrmw xchg i32* @var32, i32 %offset release
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-REG-NOT: stlxr w0, w0, [x{{[0-9]+}}]
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], w0, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
@@ -511,13 +474,12 @@ define i64 @test_atomic_load_xchg_i64(i64 %offset) nounwind {
%old = atomicrmw xchg i64* @var64, i64 %offset acquire
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-REG-NOT: stxr w0, x0, [x{{[0-9]+}}]
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], x0, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -532,21 +494,22 @@ define i8 @test_atomic_load_min_i8(i8 %offset) nounwind {
%old = atomicrmw min i8* @var8, i8 %offset acquire
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]], sxtb
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, gt
-; CHECK-REG-NOT: stxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: sxtb w[[OLD_EXT:[0-9]+]], w[[OLD]]
+; CHECK-NEXT: cmp w[[OLD_EXT]], w0, sxtb
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, le
+
; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -555,21 +518,23 @@ define i16 @test_atomic_load_min_i16(i16 %offset) nounwind {
%old = atomicrmw min i16* @var16, i16 %offset release
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]], sxth
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, gt
-; CHECK-REG-NOT: stlxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: sxth w[[OLD_EXT:[0-9]+]], w[[OLD]]
+; CHECK-NEXT: cmp w[[OLD_EXT]], w0, sxth
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, le
+
+
; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -578,21 +543,22 @@ define i32 @test_atomic_load_min_i32(i32 %offset) nounwind {
%old = atomicrmw min i32* @var32, i32 %offset monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]]
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, gt
-; CHECK-REG-NOT: stxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp w[[OLD]], w0
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, le
+
+
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
@@ -601,16 +567,17 @@ define i64 @test_atomic_load_min_i64(i64 %offset) nounwind {
%old = atomicrmw min i64* @var64, i64 %offset seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp x0, x[[OLD]]
-; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, gt
-; CHECK-REG: csel x[[NEW:[0-9]+]], x{{[0-9]+}}, x0, gt
-; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp x[[OLD]], x0
+; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, le
+
+
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -624,21 +591,23 @@ define i8 @test_atomic_load_max_i8(i8 %offset) nounwind {
%old = atomicrmw max i8* @var8, i8 %offset seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]], sxtb
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lt
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lt
-; CHECK-REG-NOT: stlxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: sxtb w[[OLD_EXT:[0-9]+]], w[[OLD]]
+; CHECK-NEXT: cmp w[[OLD_EXT]], w0, sxtb
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
+
+
; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -647,21 +616,23 @@ define i16 @test_atomic_load_max_i16(i16 %offset) nounwind {
%old = atomicrmw max i16* @var16, i16 %offset acquire
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]], sxth
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lt
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lt
-; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: sxth w[[OLD_EXT:[0-9]+]], w[[OLD]]
+; CHECK-NEXT: cmp w[[OLD_EXT]], w0, sxth
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
+
+
; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -670,21 +641,22 @@ define i32 @test_atomic_load_max_i32(i32 %offset) nounwind {
%old = atomicrmw max i32* @var32, i32 %offset release
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]]
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lt
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lt
-; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp w[[OLD]], w0
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
+
+
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
@@ -693,16 +665,17 @@ define i64 @test_atomic_load_max_i64(i64 %offset) nounwind {
%old = atomicrmw max i64* @var64, i64 %offset monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp x0, x[[OLD]]
-; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, lt
-; CHECK-REG: csel x[[NEW:[0-9]+]], x{{[0-9]+}}, x0, lt
-; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp x[[OLD]], x0
+; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, gt
+
+
; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -716,21 +689,22 @@ define i8 @test_atomic_load_umin_i8(i8 %offset) nounwind {
%old = atomicrmw umin i8* @var8, i8 %offset monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]], uxtb
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, hi
-; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp w[[OLD]], w0, uxtb
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, ls
+
+
; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -739,21 +713,22 @@ define i16 @test_atomic_load_umin_i16(i16 %offset) nounwind {
%old = atomicrmw umin i16* @var16, i16 %offset acquire
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]], uxth
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, hi
-; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp w[[OLD]], w0, uxth
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, ls
+
+
; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -762,21 +737,22 @@ define i32 @test_atomic_load_umin_i32(i32 %offset) nounwind {
%old = atomicrmw umin i32* @var32, i32 %offset seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]]
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, hi
-; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp w[[OLD]], w0
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, ls
+
+
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
@@ -785,16 +761,17 @@ define i64 @test_atomic_load_umin_i64(i64 %offset) nounwind {
%old = atomicrmw umin i64* @var64, i64 %offset acq_rel
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp x0, x[[OLD]]
-; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, hi
-; CHECK-REG: csel x[[NEW:[0-9]+]], x{{[0-9]+}}, x0, hi
-; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp x[[OLD]], x0
+; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, ls
+
+
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -808,21 +785,22 @@ define i8 @test_atomic_load_umax_i8(i8 %offset) nounwind {
%old = atomicrmw umax i8* @var8, i8 %offset acq_rel
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]], uxtb
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lo
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lo
-; CHECK-REG-NOT: stlxrb w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp w[[OLD]], w0, uxtb
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
+
+
; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -831,21 +809,22 @@ define i16 @test_atomic_load_umax_i16(i16 %offset) nounwind {
%old = atomicrmw umax i16* @var16, i16 %offset monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]], uxth
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lo
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lo
-; CHECK-REG-NOT: stxrh w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp w[[OLD]], w0, uxth
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
+
+
; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -854,21 +833,22 @@ define i32 @test_atomic_load_umax_i32(i32 %offset) nounwind {
%old = atomicrmw umax i32* @var32, i32 %offset seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp w0, w[[OLD]]
-; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, lo
-; CHECK-REG: csel w[[NEW:[0-9]+]], w{{[0-9]+}}, w0, lo
-; CHECK-REG-NOT: stlxr w[[NEW]], w[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp w[[OLD]], w0
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
+
+
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
@@ -877,16 +857,17 @@ define i64 @test_atomic_load_umax_i64(i64 %offset) nounwind {
%old = atomicrmw umax i64* @var64, i64 %offset release
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: .LBB{{[0-9]+}}_1:
-; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; x0 below is a reasonable guess but could change: it certainly comes into the
; function there.
-; CHECK-NEXT: cmp x0, x[[OLD]]
-; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, lo
-; CHECK-REG: csel x[[NEW:[0-9]+]], x{{[0-9]+}}, x0, lo
-; CHECK-REG-NOT: stlxr w[[NEW]], x[[NEW]], [x{{[0-9]+}}]
+
+; CHECK-NEXT: cmp x[[OLD]], x0
+; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, hi
+
+
; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
; CHECK-NOT: dmb
@@ -900,21 +881,20 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
%old = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire acquire
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK: [[STARTAGAIN:.LBB[0-9]+_[0-9]+]]:
-; CHECK-NEXT: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxrb w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: cmp w[[OLD]], w0
; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
; As above, w1 is a reasonable guess.
-; CHECK-REG-NOT: stxrb w1, w1, [x{{[0-9]+}}]
; CHECK: stxrb [[STATUS:w[0-9]+]], w1, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i8 %old
}
@@ -923,21 +903,20 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
%old = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst seq_cst
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var16
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
; CHECK: [[STARTAGAIN:.LBB[0-9]+_[0-9]+]]:
-; CHECK-NEXT: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldaxrh w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: cmp w[[OLD]], w0
; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
; As above, w1 is a reasonable guess.
-; CHECK-REG-NOT: stlxrh w1, w1, [x{{[0-9]+}}]
; CHECK: stlxrh [[STATUS:w[0-9]+]], w1, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i16 %old
}
@@ -946,45 +925,44 @@ define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
%old = cmpxchg i32* @var32, i32 %wanted, i32 %new release monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
; CHECK: [[STARTAGAIN:.LBB[0-9]+_[0-9]+]]:
-; CHECK-NEXT: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: cmp w[[OLD]], w0
; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
; As above, w1 is a reasonable guess.
-; CHECK-REG-NOT: stlxr w1, w1, [x{{[0-9]+}}]
; CHECK: stlxr [[STATUS:w[0-9]+]], w1, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
+; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
ret i32 %old
}
-define i64 @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind {
+define void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind {
; CHECK-LABEL: test_atomic_cmpxchg_i64:
%old = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic monotonic
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var64
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var64
; CHECK: [[STARTAGAIN:.LBB[0-9]+_[0-9]+]]:
-; CHECK-NEXT: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
+; CHECK: ldxr x[[OLD:[0-9]+]], [x[[ADDR]]]
; w0 below is a reasonable guess but could change: it certainly comes into the
; function there.
; CHECK-NEXT: cmp x[[OLD]], x0
; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
; As above, w1 is a reasonable guess.
-; CHECK-REG-NOT: stxr w1, x1, [x{{[0-9]+}}]
; CHECK: stxr [[STATUS:w[0-9]+]], x1, [x[[ADDR]]]
; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
; CHECK-NOT: dmb
-; CHECK: mov x0, x[[OLD]]
- ret i64 %old
+; CHECK: str x[[OLD]],
+ store i64 %old, i64* @var64
+ ret void
}
define i8 @test_atomic_load_monotonic_i8() nounwind {
@@ -992,7 +970,7 @@ define i8 @test_atomic_load_monotonic_i8() nounwind {
%val = load atomic i8* @var8 monotonic, align 1
; CHECK-NOT: dmb
; CHECK: adrp x[[HIADDR:[0-9]+]], var8
-; CHECK: ldrb w0, [x[[HIADDR]], #:lo12:var8]
+; CHECK: ldrb w0, [x[[HIADDR]], {{#?}}:lo12:var8]
; CHECK-NOT: dmb
ret i8 %val
@@ -1017,7 +995,7 @@ define i8 @test_atomic_load_acquire_i8() nounwind {
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
; CHECK-NOT: dmb
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
; CHECK-NOT: dmb
; CHECK: ldarb w0, [x[[ADDR]]]
; CHECK-NOT: dmb
@@ -1030,7 +1008,7 @@ define i8 @test_atomic_load_seq_cst_i8() nounwind {
; CHECK-NOT: dmb
; CHECK: adrp [[HIADDR:x[0-9]+]], var8
; CHECK-NOT: dmb
-; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], {{#?}}:lo12:var8
; CHECK-NOT: dmb
; CHECK: ldarb w0, [x[[ADDR]]]
; CHECK-NOT: dmb
@@ -1043,7 +1021,7 @@ define i16 @test_atomic_load_monotonic_i16() nounwind {
; CHECK-NOT: dmb
; CHECK: adrp x[[HIADDR:[0-9]+]], var16
; CHECK-NOT: dmb
-; CHECK: ldrh w0, [x[[HIADDR]], #:lo12:var16]
+; CHECK: ldrh w0, [x[[HIADDR]], {{#?}}:lo12:var16]
; CHECK-NOT: dmb
ret i16 %val
@@ -1068,7 +1046,7 @@ define i64 @test_atomic_load_seq_cst_i64() nounwind {
; CHECK-NOT: dmb
; CHECK: adrp [[HIADDR:x[0-9]+]], var64
; CHECK-NOT: dmb
-; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], {{#?}}:lo12:var64
; CHECK-NOT: dmb
; CHECK: ldar x0, [x[[ADDR]]]
; CHECK-NOT: dmb
@@ -1079,7 +1057,7 @@ define void @test_atomic_store_monotonic_i8(i8 %val) nounwind {
; CHECK-LABEL: test_atomic_store_monotonic_i8:
store atomic i8 %val, i8* @var8 monotonic, align 1
; CHECK: adrp x[[HIADDR:[0-9]+]], var8
-; CHECK: strb w0, [x[[HIADDR]], #:lo12:var8]
+; CHECK: strb w0, [x[[HIADDR]], {{#?}}:lo12:var8]
ret void
}
@@ -1101,7 +1079,7 @@ define void @test_atomic_store_release_i8(i8 %val) nounwind {
; CHECK-NOT: dmb
; CHECK: adrp [[HIADDR:x[0-9]+]], var8
; CHECK-NOT: dmb
-; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], {{#?}}:lo12:var8
; CHECK-NOT: dmb
; CHECK: stlrb w0, [x[[ADDR]]]
; CHECK-NOT: dmb
@@ -1114,7 +1092,7 @@ define void @test_atomic_store_seq_cst_i8(i8 %val) nounwind {
; CHECK-NOT: dmb
; CHECK: adrp [[HIADDR:x[0-9]+]], var8
; CHECK-NOT: dmb
-; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], #:lo12:var8
+; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], {{#?}}:lo12:var8
; CHECK-NOT: dmb
; CHECK: stlrb w0, [x[[ADDR]]]
; CHECK-NOT: dmb
@@ -1128,7 +1106,7 @@ define void @test_atomic_store_monotonic_i16(i16 %val) nounwind {
; CHECK-NOT: dmb
; CHECK: adrp x[[HIADDR:[0-9]+]], var16
; CHECK-NOT: dmb
-; CHECK: strh w0, [x[[HIADDR]], #:lo12:var16]
+; CHECK: strh w0, [x[[HIADDR]], {{#?}}:lo12:var16]
; CHECK-NOT: dmb
ret void
}
@@ -1153,7 +1131,7 @@ define void @test_atomic_store_release_i64(i64 %val) nounwind {
; CHECK-NOT: dmb
; CHECK: adrp [[HIADDR:x[0-9]+]], var64
; CHECK-NOT: dmb
-; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], #:lo12:var64
+; CHECK: add x[[ADDR:[0-9]+]], [[HIADDR]], {{#?}}:lo12:var64
; CHECK-NOT: dmb
; CHECK: stlr x0, [x[[ADDR]]]
; CHECK-NOT: dmb
diff --git a/test/CodeGen/AArch64/basic-pic.ll b/test/CodeGen/AArch64/basic-pic.ll
index 682b7ba..62d41bc 100644
--- a/test/CodeGen/AArch64/basic-pic.ll
+++ b/test/CodeGen/AArch64/basic-pic.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s
@var = global i32 0
@@ -7,7 +7,7 @@ define i32 @get_globalvar() {
%val = load i32* @var
; CHECK: adrp x[[GOTHI:[0-9]+]], :got:var
-; CHECK: ldr x[[GOTLOC:[0-9]+]], [x[[GOTHI]], #:got_lo12:var]
+; CHECK: ldr x[[GOTLOC:[0-9]+]], [x[[GOTHI]], {{#?}}:got_lo12:var]
; CHECK: ldr w0, [x[[GOTLOC]]]
ret i32 %val
@@ -18,7 +18,7 @@ define i32* @get_globalvaraddr() {
%val = load i32* @var
; CHECK: adrp x[[GOTHI:[0-9]+]], :got:var
-; CHECK: ldr x0, [x[[GOTHI]], #:got_lo12:var]
+; CHECK: ldr x0, [x[[GOTHI]], {{#?}}:got_lo12:var]
ret i32* @var
}
@@ -30,7 +30,7 @@ define i32 @get_hiddenvar() {
%val = load i32* @hiddenvar
; CHECK: adrp x[[HI:[0-9]+]], hiddenvar
-; CHECK: ldr w0, [x[[HI]], #:lo12:hiddenvar]
+; CHECK: ldr w0, [x[[HI]], {{#?}}:lo12:hiddenvar]
ret i32 %val
}
@@ -40,7 +40,7 @@ define i32* @get_hiddenvaraddr() {
%val = load i32* @hiddenvar
; CHECK: adrp [[HI:x[0-9]+]], hiddenvar
-; CHECK: add x0, [[HI]], #:lo12:hiddenvar
+; CHECK: add x0, [[HI]], {{#?}}:lo12:hiddenvar
ret i32* @hiddenvar
}
@@ -50,5 +50,5 @@ define void()* @get_func() {
ret void()* bitcast(void()*()* @get_func to void()*)
; CHECK: adrp x[[GOTHI:[0-9]+]], :got:get_func
-; CHECK: ldr x0, [x[[GOTHI]], #:got_lo12:get_func]
+; CHECK: ldr x0, [x[[GOTHI]], {{#?}}:got_lo12:get_func]
}
diff --git a/test/CodeGen/AArch64/bitfield-insert-0.ll b/test/CodeGen/AArch64/bitfield-insert-0.ll
index 37a18b7..da0ed8a 100644
--- a/test/CodeGen/AArch64/bitfield-insert-0.ll
+++ b/test/CodeGen/AArch64/bitfield-insert-0.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -filetype=obj < %s | llvm-objdump -disassemble - | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -filetype=obj -o - %s | llvm-objdump -disassemble - | FileCheck %s
; The encoding of lsb -> immr in the CGed bitfield instructions was wrong at one
; point, in the edge case where lsb = 0. Just make sure.
diff --git a/test/CodeGen/AArch64/bitfield-insert.ll b/test/CodeGen/AArch64/bitfield-insert.ll
index 1f04608..2369a55 100644
--- a/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/test/CodeGen/AArch64/bitfield-insert.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s --check-prefix=CHECK
; First, a simple example from Clang. The registers could plausibly be
; different, but probably won't be.
@@ -7,8 +7,7 @@
define [1 x i64] @from_clang([1 x i64] %f.coerce, i32 %n) nounwind readnone {
; CHECK-LABEL: from_clang:
-; CHECK: bfi w0, w1, #3, #4
-; CHECK-NEXT: ret
+; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #3, #4
entry:
%f.coerce.fca.0.extract = extractvalue [1 x i64] %f.coerce, 0
@@ -26,6 +25,7 @@ entry:
define void @test_whole32(i32* %existing, i32* %new) {
; CHECK-LABEL: test_whole32:
+
; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #26, #5
%oldval = load volatile i32* %existing
@@ -62,8 +62,10 @@ define void @test_whole64(i64* %existing, i64* %new) {
define void @test_whole32_from64(i64* %existing, i64* %new) {
; CHECK-LABEL: test_whole32_from64:
-; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #{{0|16}}, #16
-; CHECK-NOT: and
+
+
+; CHECK: bfxil {{x[0-9]+}}, {{x[0-9]+}}, #0, #16
+
; CHECK: ret
%oldval = load volatile i64* %existing
@@ -80,8 +82,9 @@ define void @test_whole32_from64(i64* %existing, i64* %new) {
define void @test_32bit_masked(i32 *%existing, i32 *%new) {
; CHECK-LABEL: test_32bit_masked:
+
+; CHECK: and
; CHECK: bfi [[INSERT:w[0-9]+]], {{w[0-9]+}}, #3, #4
-; CHECK: and {{w[0-9]+}}, [[INSERT]], #0xff
%oldval = load volatile i32* %existing
%oldval_keep = and i32 %oldval, 135 ; = 0x87
@@ -98,8 +101,8 @@ define void @test_32bit_masked(i32 *%existing, i32 *%new) {
define void @test_64bit_masked(i64 *%existing, i64 *%new) {
; CHECK-LABEL: test_64bit_masked:
+; CHECK: and
; CHECK: bfi [[INSERT:x[0-9]+]], {{x[0-9]+}}, #40, #8
-; CHECK: and {{x[0-9]+}}, [[INSERT]], #0xffff00000000
%oldval = load volatile i64* %existing
%oldval_keep = and i64 %oldval, 1095216660480 ; = 0xff_0000_0000
@@ -117,8 +120,9 @@ define void @test_64bit_masked(i64 *%existing, i64 *%new) {
; Mask is too complicated for literal ANDwwi, make sure other avenues are tried.
define void @test_32bit_complexmask(i32 *%existing, i32 *%new) {
; CHECK-LABEL: test_32bit_complexmask:
+
+; CHECK: and
; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #3, #4
-; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
%oldval = load volatile i32* %existing
%oldval_keep = and i32 %oldval, 647 ; = 0x287
@@ -137,6 +141,7 @@ define void @test_32bit_complexmask(i32 *%existing, i32 *%new) {
define void @test_32bit_badmask(i32 *%existing, i32 *%new) {
; CHECK-LABEL: test_32bit_badmask:
; CHECK-NOT: bfi
+; CHECK-NOT: bfm
; CHECK: ret
%oldval = load volatile i32* %existing
@@ -156,6 +161,7 @@ define void @test_32bit_badmask(i32 *%existing, i32 *%new) {
define void @test_64bit_badmask(i64 *%existing, i64 *%new) {
; CHECK-LABEL: test_64bit_badmask:
; CHECK-NOT: bfi
+; CHECK-NOT: bfm
; CHECK: ret
%oldval = load volatile i64* %existing
@@ -186,8 +192,7 @@ define void @test_32bit_with_shr(i32* %existing, i32* %new) {
%combined = or i32 %oldval_keep, %newval_masked
store volatile i32 %combined, i32* %existing
; CHECK: lsr [[BIT:w[0-9]+]], {{w[0-9]+}}, #14
-; CHECK: bfi {{w[0-9]}}, [[BIT]], #26, #5
+; CHECK: bfi {{w[0-9]+}}, [[BIT]], #26, #5
ret void
}
-
diff --git a/test/CodeGen/AArch64/bitfield.ll b/test/CodeGen/AArch64/bitfield.ll
index 1c84f5d..0e12653 100644
--- a/test/CodeGen/AArch64/bitfield.ll
+++ b/test/CodeGen/AArch64/bitfield.ll
@@ -1,5 +1,4 @@
-
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK
@var32 = global i32 0
@var64 = global i64 0
@@ -24,7 +23,7 @@ define void @test_extendb(i8 %var) {
%uxt64 = zext i8 %var to i64
store volatile i64 %uxt64, i64* @var64
-; CHECK: uxtb {{x[0-9]+}}, {{w[0-9]+}}
+; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0xff
ret void
}
@@ -48,7 +47,7 @@ define void @test_extendh(i16 %var) {
%uxt64 = zext i16 %var to i64
store volatile i64 %uxt64, i64* @var64
-; CHECK: uxth {{x[0-9]+}}, {{w[0-9]+}}
+; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0xffff
ret void
}
@@ -61,7 +60,7 @@ define void @test_extendw(i32 %var) {
%uxt64 = zext i32 %var to i64
store volatile i64 %uxt64, i64* @var64
-; CHECK: ubfx {{w[0-9]+}}, {{w[0-9]+}}, #0, #32
+; CHECK: ubfx {{x[0-9]+}}, {{x[0-9]+}}, #0, #32
ret void
}
@@ -190,7 +189,6 @@ define i32 @test_ubfx32(i32* %addr) {
define i64 @test_ubfx64(i64* %addr) {
; CHECK-LABEL: test_ubfx64:
; CHECK: ubfx {{x[0-9]+}}, {{x[0-9]+}}, #25, #10
-
%fields = load i64* %addr
%shifted = lshr i64 %fields, 25
%masked = and i64 %shifted, 1023
diff --git a/test/CodeGen/AArch64/blockaddress.ll b/test/CodeGen/AArch64/blockaddress.ll
index 8cda431..1eec4cc 100644
--- a/test/CodeGen/AArch64/blockaddress.ll
+++ b/test/CodeGen/AArch64/blockaddress.ll
@@ -9,7 +9,7 @@ define void @test_blockaddress() {
%val = load volatile i8** @addr
indirectbr i8* %val, [label %block]
; CHECK: adrp [[DEST_HI:x[0-9]+]], [[DEST_LBL:.Ltmp[0-9]+]]
-; CHECK: add [[DEST:x[0-9]+]], [[DEST_HI]], #:lo12:[[DEST_LBL]]
+; CHECK: add [[DEST:x[0-9]+]], [[DEST_HI]], {{#?}}:lo12:[[DEST_LBL]]
; CHECK: str [[DEST]],
; CHECK: ldr [[NEWDEST:x[0-9]+]]
; CHECK: br [[NEWDEST]]
diff --git a/test/CodeGen/AArch64/bool-loads.ll b/test/CodeGen/AArch64/bool-loads.ll
index 5c7640b..881aeaa 100644
--- a/test/CodeGen/AArch64/bool-loads.ll
+++ b/test/CodeGen/AArch64/bool-loads.ll
@@ -1,54 +1,54 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
@var = global i1 0
define i32 @test_sextloadi32() {
-; CHECK: test_sextloadi32
+; CHECK-LABEL: test_sextloadi32
%val = load i1* @var
%ret = sext i1 %val to i32
-; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var]
-; CHECK: sbfx {{x[0-9]+}}, {{x[0-9]+}}, #0, #1
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var]
+; CHECK: {{sbfx x[0-9]+, x[0-9]+, #0, #1|sbfx w[0-9]+, w[0-9]+, #0, #1}}
ret i32 %ret
; CHECK: ret
}
define i64 @test_sextloadi64() {
-; CHECK: test_sextloadi64
+; CHECK-LABEL: test_sextloadi64
%val = load i1* @var
%ret = sext i1 %val to i64
-; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var]
-; CHECK: sbfx {{x[0-9]+}}, {{x[0-9]+}}, #0, #1
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var]
+; CHECK: {{sbfx x[0-9]+, x[0-9]+, #0, #1}}
ret i64 %ret
; CHECK: ret
}
define i32 @test_zextloadi32() {
-; CHECK: test_zextloadi32
+; CHECK-LABEL: test_zextloadi32
; It's not actually necessary that "ret" is next, but as far as LLVM
; is concerned only 0 or 1 should be loadable so no extension is
; necessary.
%val = load i1* @var
%ret = zext i1 %val to i32
-; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var]
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var]
ret i32 %ret
; CHECK-NEXT: ret
}
define i64 @test_zextloadi64() {
-; CHECK: test_zextloadi64
+; CHECK-LABEL: test_zextloadi64
; It's not actually necessary that "ret" is next, but as far as LLVM
; is concerned only 0 or 1 should be loadable so no extension is
; necessary.
%val = load i1* @var
%ret = zext i1 %val to i64
-; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var]
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var]
ret i64 %ret
; CHECK-NEXT: ret
diff --git a/test/CodeGen/AArch64/breg.ll b/test/CodeGen/AArch64/breg.ll
index 1ed5b9b..591f483 100644
--- a/test/CodeGen/AArch64/breg.ll
+++ b/test/CodeGen/AArch64/breg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
@stored_label = global i8* null
@@ -7,7 +7,7 @@ define void @foo() {
%lab = load i8** @stored_label
indirectbr i8* %lab, [label %otherlab, label %retlab]
; CHECK: adrp {{x[0-9]+}}, stored_label
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:stored_label]
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:stored_label]
; CHECK: br {{x[0-9]+}}
otherlab:
diff --git a/test/CodeGen/AArch64/callee-save.ll b/test/CodeGen/AArch64/callee-save.ll
index 52243b0..046e6ce 100644
--- a/test/CodeGen/AArch64/callee-save.ll
+++ b/test/CodeGen/AArch64/callee-save.ll
@@ -1,14 +1,14 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
@var = global float 0.0
define void @foo() {
; CHECK-LABEL: foo:
-; CHECK: stp d14, d15, [sp
-; CHECK: stp d12, d13, [sp
-; CHECK: stp d10, d11, [sp
-; CHECK: stp d8, d9, [sp
+; CHECK: stp d15, d14, [sp
+; CHECK: stp d13, d12, [sp
+; CHECK: stp d11, d10, [sp
+; CHECK: stp d9, d8, [sp
; Create lots of live variables to exhaust the supply of
; caller-saved registers
@@ -78,9 +78,9 @@ define void @foo() {
store volatile float %val31, float* @var
store volatile float %val32, float* @var
-; CHECK: ldp d8, d9, [sp
-; CHECK: ldp d10, d11, [sp
-; CHECK: ldp d12, d13, [sp
-; CHECK: ldp d14, d15, [sp
+; CHECK: ldp d9, d8, [sp
+; CHECK: ldp d11, d10, [sp
+; CHECK: ldp d13, d12, [sp
+; CHECK: ldp d15, d14, [sp
ret void
}
diff --git a/test/CodeGen/AArch64/code-model-large-abs.ll b/test/CodeGen/AArch64/code-model-large-abs.ll
index b387f28..ca92500 100644
--- a/test/CodeGen/AArch64/code-model-large-abs.ll
+++ b/test/CodeGen/AArch64/code-model-large-abs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -code-model=large < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -code-model=large -o - %s | FileCheck %s
@var8 = global i8 0
@var16 = global i16 0
diff --git a/test/CodeGen/AArch64/compare-branch.ll b/test/CodeGen/AArch64/compare-branch.ll
index 75efd9d..a1a87cf 100644
--- a/test/CodeGen/AArch64/compare-branch.ll
+++ b/test/CodeGen/AArch64/compare-branch.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
@var32 = global i32 0
@var64 = global i64 0
diff --git a/test/CodeGen/AArch64/concatvector-bugs.ll b/test/CodeGen/AArch64/concatvector-bugs.ll
deleted file mode 100644
index 5889e22..0000000
--- a/test/CodeGen/AArch64/concatvector-bugs.ll
+++ /dev/null
@@ -1,68 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon
-; Bug: i8 type in FRP8 register but not registering with register class causes segmentation fault.
-; Fix: Removed i8 type from FPR8 register class.
-
-define void @test_concatvector_v8i8() {
-entry.split:
- br i1 undef, label %if.then, label %if.end
-
-if.then: ; preds = %entry.split
- unreachable
-
-if.end: ; preds = %entry.split
- br i1 undef, label %if.then9, label %if.end18
-
-if.then9: ; preds = %if.end
- unreachable
-
-if.end18: ; preds = %if.end
- br label %for.body
-
-for.body: ; preds = %for.inc, %if.end18
- br i1 false, label %if.then30, label %for.inc
-
-if.then30: ; preds = %for.body
- unreachable
-
-for.inc: ; preds = %for.body
- br i1 undef, label %for.end, label %for.body
-
-for.end: ; preds = %for.inc
- br label %for.body77
-
-for.body77: ; preds = %for.body77, %for.end
- br i1 undef, label %for.end106, label %for.body77
-
-for.end106: ; preds = %for.body77
- br i1 undef, label %for.body130.us.us, label %stmt.for.body130.us.us
-
-stmt.for.body130.us.us: ; preds = %stmt.for.body130.us.us, %for.end106
- %_p_splat.us = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <8 x i32> zeroinitializer
- store <8 x i8> %_p_splat.us, <8 x i8>* undef, align 1
- br label %stmt.for.body130.us.us
-
-for.body130.us.us: ; preds = %for.body130.us.us, %for.end106
- br label %for.body130.us.us
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32>, i32)
-
-define <8 x i16> @test_splat(i32 %l) nounwind {
-; CHECK-LABEL: test_splat:
-; CHECK: ret
- %lhs = insertelement <1 x i32> undef, i32 %l, i32 0
- %shift = tail call <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32> %lhs, i32 11)
- %vec = shufflevector <1 x i16> %shift, <1 x i16> undef, <8 x i32> zeroinitializer
- ret <8 x i16> %vec
-}
-
-
-define <8 x i16> @test_notsplat(<8 x i16> %a, <8 x i16> %b, i32 %l) nounwind {
-; CHECK-LABEL: test_notsplat:
-; CHECK: ret
-entry:
- %lhs = insertelement <1 x i32> undef, i32 %l, i32 0
- %shift = tail call <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32> %lhs, i32 11)
- %vec = shufflevector <1 x i16> %shift, <1 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0>
- ret <8 x i16> %vec
-}
diff --git a/test/CodeGen/AArch64/cond-sel.ll b/test/CodeGen/AArch64/cond-sel.ll
index 9c1dfeb..5f81cba 100644
--- a/test/CodeGen/AArch64/cond-sel.ll
+++ b/test/CodeGen/AArch64/cond-sel.ll
@@ -1,25 +1,25 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mcpu=cyclone | FileCheck %s --check-prefix=CHECK
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
@var32 = global i32 0
@var64 = global i64 0
-define void @test_csel(i32 %lhs32, i32 %rhs32, i64 %lhs64) {
+define void @test_csel(i32 %lhs32, i32 %rhs32, i64 %lhs64) minsize {
; CHECK-LABEL: test_csel:
%tst1 = icmp ugt i32 %lhs32, %rhs32
%val1 = select i1 %tst1, i32 42, i32 52
store i32 %val1, i32* @var32
-; CHECK-DAG: movz [[W52:w[0-9]+]], #52
-; CHECK-DAG: movz [[W42:w[0-9]+]], #42
+; CHECK-DAG: movz [[W52:w[0-9]+]], #{{52|0x34}}
+; CHECK-DAG: movz [[W42:w[0-9]+]], #{{42|0x2a}}
; CHECK: csel {{w[0-9]+}}, [[W42]], [[W52]], hi
%rhs64 = sext i32 %rhs32 to i64
%tst2 = icmp sle i64 %lhs64, %rhs64
%val2 = select i1 %tst2, i64 %lhs64, i64 %rhs64
store i64 %val2, i64* @var64
-; CHECK-DAG: cmp [[LHS:x[0-9]+]], [[RHS:w[0-9]+]], sxtw
-; CHECK-DAG: sxtw [[EXT_RHS:x[0-9]+]], [[RHS]]
+; CHECK: sxtw [[EXT_RHS:x[0-9]+]], {{[wx]}}[[RHS:[0-9]+]]
+; CHECK: cmp [[LHS:x[0-9]+]], w[[RHS]], sxtw
; CHECK: csel {{x[0-9]+}}, [[LHS]], [[EXT_RHS]], le
ret void
@@ -34,8 +34,8 @@ define void @test_floatcsel(float %lhs32, float %rhs32, double %lhs64, double %r
; CHECK-NOFP-NOT: fcmp
%val1 = select i1 %tst1, i32 42, i32 52
store i32 %val1, i32* @var32
-; CHECK: movz [[W52:w[0-9]+]], #52
-; CHECK: movz [[W42:w[0-9]+]], #42
+; CHECK: movz [[W52:w[0-9]+]], #{{52|0x34}}
+; CHECK: movz [[W42:w[0-9]+]], #{{42|0x2a}}
; CHECK: csel [[MAYBETRUE:w[0-9]+]], [[W42]], [[W52]], mi
; CHECK: csel {{w[0-9]+}}, [[W42]], [[MAYBETRUE]], gt
@@ -45,17 +45,17 @@ define void @test_floatcsel(float %lhs32, float %rhs32, double %lhs64, double %r
; CHECK-NOFP-NOT: fcmp
%val2 = select i1 %tst2, i64 9, i64 15
store i64 %val2, i64* @var64
-; CHECK: movz [[CONST15:x[0-9]+]], #15
-; CHECK: movz [[CONST9:x[0-9]+]], #9
-; CHECK: csel [[MAYBETRUE:x[0-9]+]], [[CONST9]], [[CONST15]], eq
-; CHECK: csel {{x[0-9]+}}, [[CONST9]], [[MAYBETRUE]], vs
+; CHECK: orr w[[CONST15:[0-9]+]], wzr, #0xf
+; CHECK: movz {{[wx]}}[[CONST9:[0-9]+]], #{{9|0x9}}
+; CHECK: csel [[MAYBETRUE:x[0-9]+]], x[[CONST9]], x[[CONST15]], eq
+; CHECK: csel {{x[0-9]+}}, x[[CONST9]], [[MAYBETRUE]], vs
ret void
; CHECK: ret
}
-define void @test_csinc(i32 %lhs32, i32 %rhs32, i64 %lhs64) {
+define void @test_csinc(i32 %lhs32, i32 %rhs32, i64 %lhs64) minsize {
; CHECK-LABEL: test_csinc:
; Note that commuting rhs and lhs in the select changes ugt to ule (i.e. hi to ls).
@@ -95,7 +95,7 @@ define void @test_csinc(i32 %lhs32, i32 %rhs32, i64 %lhs64) {
; CHECK: ret
}
-define void @test_csinv(i32 %lhs32, i32 %rhs32, i64 %lhs64) {
+define void @test_csinv(i32 %lhs32, i32 %rhs32, i64 %lhs64) minsize {
; CHECK-LABEL: test_csinv:
; Note that commuting rhs and lhs in the select changes ugt to ule (i.e. hi to ls).
@@ -135,7 +135,7 @@ define void @test_csinv(i32 %lhs32, i32 %rhs32, i64 %lhs64) {
; CHECK: ret
}
-define void @test_csneg(i32 %lhs32, i32 %rhs32, i64 %lhs64) {
+define void @test_csneg(i32 %lhs32, i32 %rhs32, i64 %lhs64) minsize {
; CHECK-LABEL: test_csneg:
; Note that commuting rhs and lhs in the select changes ugt to ule (i.e. hi to ls).
@@ -184,13 +184,13 @@ define void @test_cset(i32 %lhs, i32 %rhs, i64 %lhs64) {
%val1 = zext i1 %tst1 to i32
store i32 %val1, i32* @var32
; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
-; CHECK: csinc {{w[0-9]+}}, wzr, wzr, ne
+; CHECK: cset {{w[0-9]+}}, eq
%rhs64 = sext i32 %rhs to i64
%tst2 = icmp ule i64 %lhs64, %rhs64
%val2 = zext i1 %tst2 to i64
store i64 %val2, i64* @var64
-; CHECK: csinc {{w[0-9]+}}, wzr, wzr, hi
+; CHECK: cset {{w[0-9]+}}, ls
ret void
; CHECK: ret
@@ -203,13 +203,13 @@ define void @test_csetm(i32 %lhs, i32 %rhs, i64 %lhs64) {
%val1 = sext i1 %tst1 to i32
store i32 %val1, i32* @var32
; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
-; CHECK: csinv {{w[0-9]+}}, wzr, wzr, ne
+; CHECK: csetm {{w[0-9]+}}, eq
%rhs64 = sext i32 %rhs to i64
%tst2 = icmp ule i64 %lhs64, %rhs64
%val2 = sext i1 %tst2 to i64
store i64 %val2, i64* @var64
-; CHECK: csinv {{x[0-9]+}}, xzr, xzr, hi
+; CHECK: csetm {{x[0-9]+}}, ls
ret void
; CHECK: ret
diff --git a/test/CodeGen/AArch64/cpus.ll b/test/CodeGen/AArch64/cpus.ll
index f0b60f0..f0f36bd 100644
--- a/test/CodeGen/AArch64/cpus.ll
+++ b/test/CodeGen/AArch64/cpus.ll
@@ -1,9 +1,10 @@
; This tests that llc accepts all valid AArch64 CPUs
-; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s
-; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cortex-a53 2>&1 | FileCheck %s
-; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cortex-a57 2>&1 | FileCheck %s
-; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
+
+; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a53 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a57 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
; CHECK-NOT: {{.*}} is not a recognized processor for this target
; INVALID: {{.*}} is not a recognized processor for this target
diff --git a/test/CodeGen/AArch64/directcond.ll b/test/CodeGen/AArch64/directcond.ll
index 12c7b6a..1b51928 100644
--- a/test/CodeGen/AArch64/directcond.ll
+++ b/test/CodeGen/AArch64/directcond.ll
@@ -1,11 +1,10 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s --check-prefix=CHECK
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
define i32 @test_select_i32(i1 %bit, i32 %a, i32 %b) {
; CHECK-LABEL: test_select_i32:
%val = select i1 %bit, i32 %a, i32 %b
-; CHECK: movz [[ONE:w[0-9]+]], #1
-; CHECK: tst w0, [[ONE]]
+; CHECK: tst w0, #0x1
; CHECK-NEXT: csel w0, w1, w2, ne
ret i32 %val
@@ -14,8 +13,7 @@ define i32 @test_select_i32(i1 %bit, i32 %a, i32 %b) {
define i64 @test_select_i64(i1 %bit, i64 %a, i64 %b) {
; CHECK-LABEL: test_select_i64:
%val = select i1 %bit, i64 %a, i64 %b
-; CHECK: movz [[ONE:w[0-9]+]], #1
-; CHECK: tst w0, [[ONE]]
+; CHECK: tst w0, #0x1
; CHECK-NEXT: csel x0, x1, x2, ne
ret i64 %val
@@ -24,8 +22,7 @@ define i64 @test_select_i64(i1 %bit, i64 %a, i64 %b) {
define float @test_select_float(i1 %bit, float %a, float %b) {
; CHECK-LABEL: test_select_float:
%val = select i1 %bit, float %a, float %b
-; CHECK: movz [[ONE:w[0-9]+]], #1
-; CHECK: tst w0, [[ONE]]
+; CHECK: tst w0, #0x1
; CHECK-NEXT: fcsel s0, s0, s1, ne
; CHECK-NOFP-NOT: fcsel
ret float %val
@@ -34,8 +31,7 @@ define float @test_select_float(i1 %bit, float %a, float %b) {
define double @test_select_double(i1 %bit, double %a, double %b) {
; CHECK-LABEL: test_select_double:
%val = select i1 %bit, double %a, double %b
-; CHECK: movz [[ONE:w[0-9]+]], #1
-; CHECK: tst w0, [[ONE]]
+; CHECK: tst w0, #0x1
; CHECK-NEXT: fcsel d0, d0, d1, ne
; CHECK-NOFP-NOT: fcsel
@@ -45,7 +41,7 @@ define double @test_select_double(i1 %bit, double %a, double %b) {
define i32 @test_brcond(i1 %bit) {
; CHECK-LABEL: test_brcond:
br i1 %bit, label %true, label %false
-; CHECK: tbz {{w[0-9]+}}, #0, .LBB
+; CHECK: tbz {{w[0-9]+}}, #0, {{.?LBB}}
true:
ret i32 0
@@ -57,7 +53,7 @@ define i1 @test_setcc_float(float %lhs, float %rhs) {
; CHECK: test_setcc_float
%val = fcmp oeq float %lhs, %rhs
; CHECK: fcmp s0, s1
-; CHECK: csinc w0, wzr, wzr, ne
+; CHECK: cset w0, eq
; CHECK-NOFP-NOT: fcmp
ret i1 %val
}
@@ -66,7 +62,7 @@ define i1 @test_setcc_double(double %lhs, double %rhs) {
; CHECK: test_setcc_double
%val = fcmp oeq double %lhs, %rhs
; CHECK: fcmp d0, d1
-; CHECK: csinc w0, wzr, wzr, ne
+; CHECK: cset w0, eq
; CHECK-NOFP-NOT: fcmp
ret i1 %val
}
@@ -75,7 +71,7 @@ define i1 @test_setcc_i32(i32 %lhs, i32 %rhs) {
; CHECK: test_setcc_i32
%val = icmp ugt i32 %lhs, %rhs
; CHECK: cmp w0, w1
-; CHECK: csinc w0, wzr, wzr, ls
+; CHECK: cset w0, hi
ret i1 %val
}
@@ -83,6 +79,6 @@ define i1 @test_setcc_i64(i64 %lhs, i64 %rhs) {
; CHECK: test_setcc_i64
%val = icmp ne i64 %lhs, %rhs
; CHECK: cmp x0, x1
-; CHECK: csinc w0, wzr, wzr, eq
+; CHECK: cset w0, ne
ret i1 %val
}
diff --git a/test/CodeGen/AArch64/dp-3source.ll b/test/CodeGen/AArch64/dp-3source.ll
index 81d9e15..22bd4a8 100644
--- a/test/CodeGen/AArch64/dp-3source.ll
+++ b/test/CodeGen/AArch64/dp-3source.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
define i32 @test_madd32(i32 %val0, i32 %val1, i32 %val2) {
; CHECK-LABEL: test_madd32:
diff --git a/test/CodeGen/AArch64/dp1.ll b/test/CodeGen/AArch64/dp1.ll
index 6a8d55c..662b415 100644
--- a/test/CodeGen/AArch64/dp1.ll
+++ b/test/CodeGen/AArch64/dp1.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
@var32 = global i32 0
@var64 = global i64 0
diff --git a/test/CodeGen/AArch64/dp2.ll b/test/CodeGen/AArch64/dp2.ll
index 48b0701..71b3169 100644
--- a/test/CodeGen/AArch64/dp2.ll
+++ b/test/CodeGen/AArch64/dp2.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64 | FileCheck %s
@var32_0 = global i32 0
@var32_1 = global i32 0
@@ -13,7 +13,7 @@ define void @rorv_i64() {
%val3_tmp = shl i64 %val0_tmp, %val2_tmp
%val4_tmp = lshr i64 %val0_tmp, %val1_tmp
%val5_tmp = or i64 %val3_tmp, %val4_tmp
-; CHECK: ror {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: {{ror|rorv}} {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
store volatile i64 %val5_tmp, i64* @var64_0
ret void
}
@@ -23,7 +23,7 @@ define void @asrv_i64() {
%val0_tmp = load i64* @var64_0
%val1_tmp = load i64* @var64_1
%val4_tmp = ashr i64 %val0_tmp, %val1_tmp
-; CHECK: asr {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: {{asr|asrv}} {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
store volatile i64 %val4_tmp, i64* @var64_1
ret void
}
@@ -33,7 +33,7 @@ define void @lsrv_i64() {
%val0_tmp = load i64* @var64_0
%val1_tmp = load i64* @var64_1
%val4_tmp = lshr i64 %val0_tmp, %val1_tmp
-; CHECK: lsr {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: {{lsr|lsrv}} {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
store volatile i64 %val4_tmp, i64* @var64_0
ret void
}
@@ -43,7 +43,7 @@ define void @lslv_i64() {
%val0_tmp = load i64* @var64_0
%val1_tmp = load i64* @var64_1
%val4_tmp = shl i64 %val0_tmp, %val1_tmp
-; CHECK: lsl {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECK: {{lsl|lslv}} {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
store volatile i64 %val4_tmp, i64* @var64_1
ret void
}
@@ -75,7 +75,7 @@ define void @lsrv_i32() {
%val1_tmp = load i32* @var32_1
%val2_tmp = add i32 1, %val1_tmp
%val4_tmp = lshr i32 %val0_tmp, %val2_tmp
-; CHECK: lsr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: {{lsr|lsrv}} {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
store volatile i32 %val4_tmp, i32* @var32_0
ret void
}
@@ -86,7 +86,7 @@ define void @lslv_i32() {
%val1_tmp = load i32* @var32_1
%val2_tmp = add i32 1, %val1_tmp
%val4_tmp = shl i32 %val0_tmp, %val2_tmp
-; CHECK: lsl {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: {{lsl|lslv}} {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
store volatile i32 %val4_tmp, i32* @var32_1
ret void
}
@@ -100,7 +100,7 @@ define void @rorv_i32() {
%val3_tmp = shl i32 %val0_tmp, %val2_tmp
%val4_tmp = lshr i32 %val0_tmp, %val1_tmp
%val5_tmp = or i32 %val3_tmp, %val4_tmp
-; CHECK: ror {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: {{ror|rorv}} {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
store volatile i32 %val5_tmp, i32* @var32_0
ret void
}
@@ -111,7 +111,7 @@ define void @asrv_i32() {
%val1_tmp = load i32* @var32_1
%val2_tmp = add i32 1, %val1_tmp
%val4_tmp = ashr i32 %val0_tmp, %val2_tmp
-; CHECK: asr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: {{asr|asrv}} {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
store volatile i32 %val4_tmp, i32* @var32_1
ret void
}
@@ -143,7 +143,7 @@ define i32 @test_lsl32() {
%val = load i32* @var32_0
%ret = shl i32 1, %val
-; CHECK: lsl {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: {{lsl|lslv}} {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
ret i32 %ret
}
@@ -153,7 +153,7 @@ define i32 @test_lsr32() {
%val = load i32* @var32_0
%ret = lshr i32 1, %val
-; CHECK: lsr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: {{lsr|lsrv}} {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
ret i32 %ret
}
@@ -163,7 +163,7 @@ define i32 @test_asr32(i32 %in) {
%val = load i32* @var32_0
%ret = ashr i32 %in, %val
-; CHECK: asr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK: {{asr|asrv}} {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
ret i32 %ret
}
diff --git a/test/CodeGen/AArch64/eliminate-trunc.ll b/test/CodeGen/AArch64/eliminate-trunc.ll
new file mode 100644
index 0000000..ea86a08
--- /dev/null
+++ b/test/CodeGen/AArch64/eliminate-trunc.ll
@@ -0,0 +1,39 @@
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-apple-ios7.0 -mcpu=cyclone | FileCheck %s
+
+; Check that the trunc i64 operation is translated as a subregister access,
+; eliminating an i32 induction variable.
+
+; CHECK-NOT: add {{x[0-9]+}}, {{x[0-9]+}}, #1
+; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #1
+; CHECK-NEXT: cmp {{w[0-9]+}}, {{w[0-9]+}}
+define void @test1_signed([8 x i8]* nocapture %a, i8* nocapture readonly %box, i8 %limit) minsize {
+entry:
+ %conv = zext i8 %limit to i32
+ %cmp223 = icmp eq i8 %limit, 0
+ br i1 %cmp223, label %for.end15, label %for.body4.lr.ph.us
+
+for.body4.us:
+ %indvars.iv = phi i64 [ 0, %for.body4.lr.ph.us ], [ %indvars.iv.next, %for.body4.us ]
+ %arrayidx6.us = getelementptr inbounds [8 x i8]* %a, i64 %indvars.iv26, i64 %indvars.iv
+ %0 = load i8* %arrayidx6.us, align 1
+ %idxprom7.us = zext i8 %0 to i64
+ %arrayidx8.us = getelementptr inbounds i8* %box, i64 %idxprom7.us
+ %1 = load i8* %arrayidx8.us, align 1
+ store i8 %1, i8* %arrayidx6.us, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %2 = trunc i64 %indvars.iv.next to i32
+ %cmp2.us = icmp slt i32 %2, %conv
+ br i1 %cmp2.us, label %for.body4.us, label %for.cond1.for.inc13_crit_edge.us
+
+for.body4.lr.ph.us:
+ %indvars.iv26 = phi i64 [ %indvars.iv.next27, %for.cond1.for.inc13_crit_edge.us ], [ 0, %entry ]
+ br label %for.body4.us
+
+for.cond1.for.inc13_crit_edge.us:
+ %indvars.iv.next27 = add nuw nsw i64 %indvars.iv26, 1
+ %exitcond28 = icmp eq i64 %indvars.iv26, 3
+ br i1 %exitcond28, label %for.end15, label %for.body4.lr.ph.us
+
+for.end15:
+ ret void
+}
diff --git a/test/CodeGen/AArch64/extern-weak.ll b/test/CodeGen/AArch64/extern-weak.ll
index 322b3f4..ce5c0f6 100644
--- a/test/CodeGen/AArch64/extern-weak.ll
+++ b/test/CodeGen/AArch64/extern-weak.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -o - < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -code-model=large -o - < %s | FileCheck --check-prefix=CHECK-LARGE %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -code-model=large -o - %s | FileCheck --check-prefix=CHECK-LARGE %s
declare extern_weak i32 @var()
@@ -7,10 +7,10 @@ define i32()* @foo() {
; The usual ADRP/ADD pair can't be used for a weak reference because it must
; evaluate to 0 if the symbol is undefined. We use a litpool entry.
ret i32()* @var
-; CHECK: .LCPI0_0:
-; CHECK-NEXT: .xword var
-; CHECK: ldr x0, [{{x[0-9]+}}, #:lo12:.LCPI0_0]
+
+; CHECK: adrp x[[ADDRHI:[0-9]+]], :got:var
+; CHECK: ldr x0, [x[[ADDRHI]], :got_lo12:var]
; In the large model, the usual relocations are absolute and can
; materialise 0.
@@ -25,27 +25,29 @@ define i32()* @foo() {
define i32* @bar() {
%addr = getelementptr [10 x i32]* @arr_var, i32 0, i32 5
-; CHECK: .LCPI1_0:
-; CHECK-NEXT: .xword arr_var
-; CHECK: ldr [[BASE:x[0-9]+]], [{{x[0-9]+}}, #:lo12:.LCPI1_0]
+
+; CHECK: adrp x[[ADDRHI:[0-9]+]], :got:arr_var
+; CHECK: ldr [[BASE:x[0-9]+]], [x[[ADDRHI]], :got_lo12:arr_var]
; CHECK: add x0, [[BASE]], #20
+
ret i32* %addr
; In the large model, the usual relocations are absolute and can
; materialise 0.
-; CHECK-LARGE: movz x0, #:abs_g3:arr_var
-; CHECK-LARGE: movk x0, #:abs_g2_nc:arr_var
-; CHECK-LARGE: movk x0, #:abs_g1_nc:arr_var
-; CHECK-LARGE: movk x0, #:abs_g0_nc:arr_var
+; CHECK-LARGE: movz [[ADDR:x[0-9]+]], #:abs_g3:arr_var
+; CHECK-LARGE: movk [[ADDR]], #:abs_g2_nc:arr_var
+; CHECK-LARGE: movk [[ADDR]], #:abs_g1_nc:arr_var
+; CHECK-LARGE: movk [[ADDR]], #:abs_g0_nc:arr_var
}
@defined_weak_var = internal unnamed_addr global i32 0
define i32* @wibble() {
ret i32* @defined_weak_var
+
; CHECK: adrp [[BASE:x[0-9]+]], defined_weak_var
-; CHECK: add x0, [[BASE]], #:lo12:defined_weak_var
+; CHECK: add x0, [[BASE]], :lo12:defined_weak_var
; CHECK-LARGE: movz x0, #:abs_g3:defined_weak_var
; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var
diff --git a/test/CodeGen/AArch64/extract.ll b/test/CodeGen/AArch64/extract.ll
index 62d9ed2..1fc9387 100644
--- a/test/CodeGen/AArch64/extract.ll
+++ b/test/CodeGen/AArch64/extract.ll
@@ -1,11 +1,11 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
define i64 @ror_i64(i64 %in) {
; CHECK-LABEL: ror_i64:
%left = shl i64 %in, 19
%right = lshr i64 %in, 45
%val5 = or i64 %left, %right
-; CHECK: extr {{x[0-9]+}}, x0, x0, #45
+; CHECK: ror {{x[0-9]+}}, x0, #45
ret i64 %val5
}
@@ -14,7 +14,7 @@ define i32 @ror_i32(i32 %in) {
%left = shl i32 %in, 9
%right = lshr i32 %in, 23
%val5 = or i32 %left, %right
-; CHECK: extr {{w[0-9]+}}, w0, w0, #23
+; CHECK: ror {{w[0-9]+}}, w0, #23
ret i32 %val5
}
diff --git a/test/CodeGen/AArch64/fastcc-reserved.ll b/test/CodeGen/AArch64/fastcc-reserved.ll
index c6c0505..a392619 100644
--- a/test/CodeGen/AArch64/fastcc-reserved.ll
+++ b/test/CodeGen/AArch64/fastcc-reserved.ll
@@ -12,8 +12,8 @@ define fastcc void @foo(i32 %in) {
%addr = alloca i8, i32 %in
; Normal frame setup stuff:
-; CHECK: sub sp, sp,
-; CHECK: stp x29, x30
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
; Reserve space for call-frame:
; CHECK: sub sp, sp, #16
@@ -26,8 +26,8 @@ define fastcc void @foo(i32 %in) {
; CHECK-NOT: sub sp, sp, #16
; CHECK-NOT: add sp, sp,
-; CHECK: ldp x29, x30
-; CHECK: add sp, sp,
+; CHECK: mov sp, x29
+; CHECK: ldp x29, x30, [sp], #16
ret void
}
@@ -38,8 +38,8 @@ define void @foo1(i32 %in) {
%addr = alloca i8, i32 %in
; Normal frame setup again
-; CHECK: sub sp, sp,
-; CHECK: stp x29, x30
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
; Reserve space for call-frame
; CHECK: sub sp, sp, #16
@@ -52,7 +52,7 @@ define void @foo1(i32 %in) {
; Check for epilogue (primarily to make sure sp spotted above wasn't
; part of it).
-; CHECK: ldp x29, x30
-; CHECK: add sp, sp,
+; CHECK: mov sp, x29
+; CHECK: ldp x29, x30, [sp], #16
ret void
}
diff --git a/test/CodeGen/AArch64/fastcc.ll b/test/CodeGen/AArch64/fastcc.ll
index a4cd378..9917fcd 100644
--- a/test/CodeGen/AArch64/fastcc.ll
+++ b/test/CodeGen/AArch64/fastcc.ll
@@ -6,10 +6,13 @@
define fastcc void @func_stack0() {
; CHECK-LABEL: func_stack0:
-; CHECK: sub sp, sp, #48
+; CHECK: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #32
; CHECK-TAIL-LABEL: func_stack0:
-; CHECK-TAIL: sub sp, sp, #48
+; CHECK-TAIL: stp x29, x30, [sp, #-16]!
+; CHECK-TAIL-NEXT: mov x29, sp
+; CHECK-TAIL-NEXT: sub sp, sp, #32
call fastcc void @func_stack8([8 x i32] undef, i32 42)
@@ -24,6 +27,7 @@ define fastcc void @func_stack0() {
; CHECK: bl func_stack32
; CHECK-NOT: sub sp, sp,
+
; CHECK-TAIL: bl func_stack32
; CHECK-TAIL: sub sp, sp, #32
@@ -32,30 +36,39 @@ define fastcc void @func_stack0() {
; CHECK: bl func_stack0
; CHECK-NOT: sub sp, sp
+
; CHECK-TAIL: bl func_stack0
; CHECK-TAIL-NOT: sub sp, sp
ret void
-; CHECK: add sp, sp, #48
+; CHECK: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16
; CHECK-NEXT: ret
-; CHECK-TAIL: add sp, sp, #48
-; CHECK-TAIL-NEXT: ret
+; CHECK-TAIL: mov sp, x29
+; CHECK-TAIL-NEXT: ldp x29, x30, [sp], #16
+; CHECK-TAIL-NEXT: ret
}
define fastcc void @func_stack8([8 x i32], i32 %stacked) {
; CHECK-LABEL: func_stack8:
-; CHECK: sub sp, sp, #48
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
+; CHECK: sub sp, sp, #32
+
; CHECK-TAIL-LABEL: func_stack8:
-; CHECK-TAIL: sub sp, sp, #48
+; CHECK-TAIL: stp x29, x30, [sp, #-16]!
+; CHECK-TAIL: mov x29, sp
+; CHECK-TAIL: sub sp, sp, #32
call fastcc void @func_stack8([8 x i32] undef, i32 42)
; CHECK: bl func_stack8
; CHECK-NOT: sub sp, sp,
+
; CHECK-TAIL: bl func_stack8
; CHECK-TAIL: sub sp, sp, #16
@@ -64,6 +77,7 @@ define fastcc void @func_stack8([8 x i32], i32 %stacked) {
; CHECK: bl func_stack32
; CHECK-NOT: sub sp, sp,
+
; CHECK-TAIL: bl func_stack32
; CHECK-TAIL: sub sp, sp, #32
@@ -76,19 +90,22 @@ define fastcc void @func_stack8([8 x i32], i32 %stacked) {
; CHECK-TAIL-NOT: sub sp, sp
ret void
-; CHECK: add sp, sp, #48
+; CHECK: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16
; CHECK-NEXT: ret
-; CHECK-TAIL: add sp, sp, #64
+
+; CHECK-TAIL: mov sp, x29
+; CHECK-TAIL-NEXT: ldp x29, x30, [sp], #16
; CHECK-TAIL-NEXT: ret
}
define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) {
; CHECK-LABEL: func_stack32:
-; CHECK: sub sp, sp, #48
+; CHECK: mov x29, sp
; CHECK-TAIL-LABEL: func_stack32:
-; CHECK-TAIL: sub sp, sp, #48
+; CHECK-TAIL: mov x29, sp
call fastcc void @func_stack8([8 x i32] undef, i32 42)
@@ -103,6 +120,7 @@ define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) {
; CHECK: bl func_stack32
; CHECK-NOT: sub sp, sp,
+
; CHECK-TAIL: bl func_stack32
; CHECK-TAIL: sub sp, sp, #32
@@ -111,13 +129,16 @@ define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) {
; CHECK: bl func_stack0
; CHECK-NOT: sub sp, sp
+
; CHECK-TAIL: bl func_stack0
; CHECK-TAIL-NOT: sub sp, sp
ret void
-; CHECK: add sp, sp, #48
+; CHECK: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16
; CHECK-NEXT: ret
-; CHECK-TAIL: add sp, sp, #80
+; CHECK-TAIL: mov sp, x29
+; CHECK-TAIL-NEXT: ldp x29, x30, [sp], #16
; CHECK-TAIL-NEXT: ret
}
diff --git a/test/CodeGen/AArch64/fcmp.ll b/test/CodeGen/AArch64/fcmp.ll
index a9518ea..3c74508 100644
--- a/test/CodeGen/AArch64/fcmp.ll
+++ b/test/CodeGen/AArch64/fcmp.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
declare void @bar(i32)
diff --git a/test/CodeGen/AArch64/fcvt-fixed.ll b/test/CodeGen/AArch64/fcvt-fixed.ll
index 9d66da4..ccb3616 100644
--- a/test/CodeGen/AArch64/fcvt-fixed.ll
+++ b/test/CodeGen/AArch64/fcvt-fixed.ll
@@ -1,4 +1,8 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0 | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-apple-ios7.0 -O0
+
+; (The -O0 run is to make sure FastISel still constrains its operands properly
+; and the machine verifier doesn't trigger.)
@var32 = global i32 0
@var64 = global i64 0
diff --git a/test/CodeGen/AArch64/fcvt-int.ll b/test/CodeGen/AArch64/fcvt-int.ll
index 97427a7..d549c7e 100644
--- a/test/CodeGen/AArch64/fcvt-int.ll
+++ b/test/CodeGen/AArch64/fcvt-int.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
define i32 @test_floattoi32(float %in) {
; CHECK-LABEL: test_floattoi32:
diff --git a/test/CodeGen/AArch64/flags-multiuse.ll b/test/CodeGen/AArch64/flags-multiuse.ll
index e99c728..c9b0b9f 100644
--- a/test/CodeGen/AArch64/flags-multiuse.ll
+++ b/test/CodeGen/AArch64/flags-multiuse.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
; LLVM should be able to cope with multiple uses of the same flag-setting
; instruction at different points of a routine. Either by rematerializing the
@@ -15,7 +15,7 @@ define i32 @test_multiflag(i32 %n, i32 %m, i32 %o) {
; CHECK: cmp [[LHS:w[0-9]+]], [[RHS:w[0-9]+]]
%val = zext i1 %test to i32
-; CHECK: csinc {{[xw][0-9]+}}, {{xzr|wzr}}, {{xzr|wzr}}, eq
+; CHECK: cset {{[xw][0-9]+}}, ne
store i32 %val, i32* @var
diff --git a/test/CodeGen/AArch64/floatdp_1source.ll b/test/CodeGen/AArch64/floatdp_1source.ll
index 3d7f8f0..8c02787 100644
--- a/test/CodeGen/AArch64/floatdp_1source.ll
+++ b/test/CodeGen/AArch64/floatdp_1source.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
@varhalf = global half 0.0
@varfloat = global float 0.0
diff --git a/test/CodeGen/AArch64/floatdp_2source.ll b/test/CodeGen/AArch64/floatdp_2source.ll
index bb65528..2622717 100644
--- a/test/CodeGen/AArch64/floatdp_2source.ll
+++ b/test/CodeGen/AArch64/floatdp_2source.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu -mcpu=cyclone | FileCheck %s
@varfloat = global float 0.0
@vardouble = global double 0.0
diff --git a/test/CodeGen/AArch64/fp-cond-sel.ll b/test/CodeGen/AArch64/fp-cond-sel.ll
index 572f42e..b4f4d77 100644
--- a/test/CodeGen/AArch64/fp-cond-sel.ll
+++ b/test/CodeGen/AArch64/fp-cond-sel.ll
@@ -1,25 +1,34 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -mcpu=cyclone | FileCheck %s --check-prefix=CHECK
@varfloat = global float 0.0
@vardouble = global double 0.0
+declare void @use_float(float)
+declare void @use_double(double)
+
define void @test_csel(i32 %lhs32, i32 %rhs32, i64 %lhs64) {
; CHECK-LABEL: test_csel:
%tst1 = icmp ugt i32 %lhs32, %rhs32
%val1 = select i1 %tst1, float 0.0, float 1.0
store float %val1, float* @varfloat
-; CHECK: ldr [[FLT0:s[0-9]+]], [{{x[0-9]+}}, #:lo12:.LCPI
-; CHECK: fmov [[FLT1:s[0-9]+]], #1.0
-; CHECK: fcsel {{s[0-9]+}}, [[FLT0]], [[FLT1]], hi
+; CHECK: movi v[[FLT0:[0-9]+]].2d, #0
+; CHECK: fmov s[[FLT1:[0-9]+]], #1.0
+; CHECK: fcsel {{s[0-9]+}}, s[[FLT0]], s[[FLT1]], hi
%rhs64 = sext i32 %rhs32 to i64
%tst2 = icmp sle i64 %lhs64, %rhs64
%val2 = select i1 %tst2, double 1.0, double 0.0
store double %val2, double* @vardouble
-; CHECK: ldr [[FLT0:d[0-9]+]], [{{x[0-9]+}}, #:lo12:.LCPI
-; CHECK: fmov [[FLT1:d[0-9]+]], #1.0
-; CHECK: fcsel {{d[0-9]+}}, [[FLT1]], [[FLT0]], le
+; FLT0 is reused from above on ARM64.
+; CHECK: fmov d[[FLT1:[0-9]+]], #1.0
+; CHECK: fcsel {{d[0-9]+}}, d[[FLT1]], d[[FLT0]], le
+
+ call void @use_float(float 0.0)
+ call void @use_float(float 1.0)
+
+ call void @use_double(double 0.0)
+ call void @use_double(double 1.0)
ret void
; CHECK: ret
diff --git a/test/CodeGen/AArch64/fp-dp3.ll b/test/CodeGen/AArch64/fp-dp3.ll
index 2a6790e..10f88fd 100644
--- a/test/CodeGen/AArch64/fp-dp3.ll
+++ b/test/CodeGen/AArch64/fp-dp3.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -fp-contract=fast | FileCheck %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -check-prefix=CHECK-NOFAST
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -fp-contract=fast | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s -check-prefix=CHECK-NOFAST
declare float @llvm.fma.f32(float, float, float)
declare double @llvm.fma.f64(double, double, double)
diff --git a/test/CodeGen/AArch64/fp128-folding.ll b/test/CodeGen/AArch64/fp128-folding.ll
index b1c560d..892b19c 100644
--- a/test/CodeGen/AArch64/fp128-folding.ll
+++ b/test/CodeGen/AArch64/fp128-folding.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
declare void @bar(i8*, i8*, i32*)
; SelectionDAG used to try to fold some fp128 operations using the ppc128 type,
@@ -12,6 +12,6 @@ define fp128 @test_folding() {
%fpval = sitofp i32 %val to fp128
; If the value is loaded from a constant pool into an fp128, it's been folded
; successfully.
-; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.LCPI
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.LCPI
ret fp128 %fpval
}
diff --git a/test/CodeGen/AArch64/fpimm.ll b/test/CodeGen/AArch64/fpimm.ll
index b8f7169..e59520c 100644
--- a/test/CodeGen/AArch64/fpimm.ll
+++ b/test/CodeGen/AArch64/fpimm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
@varf32 = global float 0.0
@varf64 = global double 0.0
@@ -13,7 +13,7 @@ define void @check_float() {
%newval2 = fadd float %val, 128.0
store volatile float %newval2, float* @varf32
-; CHECK-DAG: ldr [[HARD:s[0-9]+]], [{{x[0-9]+}}, #:lo12:.LCPI0_0
+; CHECK-DAG: ldr [[HARD:s[0-9]+]], [{{x[0-9]+}}, {{#?}}:lo12:.LCPI0_0
; CHECK: ret
ret void
@@ -29,7 +29,7 @@ define void @check_double() {
%newval2 = fadd double %val, 128.0
store volatile double %newval2, double* @varf64
-; CHECK-DAG: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.LCPI1_0
+; CHECK-DAG: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.LCPI1_0
; CHECK: ret
ret void
diff --git a/test/CodeGen/AArch64/frameaddr.ll b/test/CodeGen/AArch64/frameaddr.ll
index 182704b..85d95e2 100644
--- a/test/CodeGen/AArch64/frameaddr.ll
+++ b/test/CodeGen/AArch64/frameaddr.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
define i8* @t() nounwind {
entry:
@@ -12,7 +12,7 @@ define i8* @t2() nounwind {
entry:
; CHECK-LABEL: t2:
; CHECK: ldr x[[reg:[0-9]+]], [x29]
-; CHECK: ldr x[[reg]], [x[[reg]]]
+; CHECK: ldr {{x[0-9]+}}, [x[[reg]]]
%0 = call i8* @llvm.frameaddress(i32 2)
ret i8* %0
}
diff --git a/test/CodeGen/AArch64/free-zext.ll b/test/CodeGen/AArch64/free-zext.ll
new file mode 100644
index 0000000..d69105e
--- /dev/null
+++ b/test/CodeGen/AArch64/free-zext.ll
@@ -0,0 +1,14 @@
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+
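+; The zero-extensions should be free: ldrb/ldrh already clear the upper bits of
+; the destination register, so no separate uxtb/uxth (or and) is expected
+; before the add checked below.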
+define i64 @test_free_zext(i8* %a, i16* %b) {
+; CHECK-LABEL: test_free_zext
+; CHECK-DAG: ldrb w[[A:[0-9]+]], [x0]
+; CHECK: ldrh w[[B:[0-9]+]], [x1]
+; CHECK: add x0, x[[B]], x[[A]]
+ %1 = load i8* %a, align 1
+ %conv = zext i8 %1 to i64
+ %2 = load i16* %b, align 2
+ %conv1 = zext i16 %2 to i64
+ %add = add nsw i64 %conv1, %conv
+ ret i64 %add
+}
diff --git a/test/CodeGen/AArch64/func-argpassing.ll b/test/CodeGen/AArch64/func-argpassing.ll
index f307686..abb732c 100644
--- a/test/CodeGen/AArch64/func-argpassing.ll
+++ b/test/CodeGen/AArch64/func-argpassing.ll
@@ -1,7 +1,5 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK %s
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
%myStruct = type { i64 , i8, i32 }
@@ -18,7 +16,7 @@ define void @take_i8s(i8 %val1, i8 %val2) {
store i8 %val2, i8* @var8
; Not using w1 may be technically allowed, but it would indicate a
; problem in itself.
-; CHECK: strb w1, [{{x[0-9]+}}, #:lo12:var8]
+; CHECK: strb w1, [{{x[0-9]+}}, {{#?}}:lo12:var8]
ret void
}
@@ -28,7 +26,7 @@ define void @add_floats(float %val1, float %val2) {
; CHECK: fadd [[ADDRES:s[0-9]+]], s0, s1
; CHECK-NOFP-NOT: fadd
store float %newval, float* @varfloat
-; CHECK: str [[ADDRES]], [{{x[0-9]+}}, #:lo12:varfloat]
+; CHECK: str [[ADDRES]], [{{x[0-9]+}}, {{#?}}:lo12:varfloat]
ret void
}
@@ -43,12 +41,12 @@ define void @take_struct(%myStruct* byval %structval) {
; Some weird move means x0 is used for one access
; CHECK: ldr [[REG32:w[0-9]+]], [{{x[0-9]+|sp}}, #12]
store volatile i32 %val0, i32* @var32
-; CHECK: str [[REG32]], [{{x[0-9]+}}, #:lo12:var32]
+; CHECK: str [[REG32]], [{{x[0-9]+}}, {{#?}}:lo12:var32]
%val1 = load volatile i64* %addr1
; CHECK: ldr [[REG64:x[0-9]+]], [{{x[0-9]+|sp}}]
store volatile i64 %val1, i64* @var64
-; CHECK: str [[REG64]], [{{x[0-9]+}}, #:lo12:var64]
+; CHECK: str [[REG64]], [{{x[0-9]+}}, {{#?}}:lo12:var64]
ret void
}
@@ -62,15 +60,14 @@ define void @check_byval_align(i32* byval %ignore, %myStruct* byval align 16 %st
%val0 = load volatile i32* %addr0
; Some weird move means x0 is used for one access
-; CHECK: add x[[STRUCTVAL_ADDR:[0-9]+]], sp, #16
-; CHECK: ldr [[REG32:w[0-9]+]], [x[[STRUCTVAL_ADDR]], #12]
+; CHECK: ldr [[REG32:w[0-9]+]], [sp, #28]
store i32 %val0, i32* @var32
-; CHECK: str [[REG32]], [{{x[0-9]+}}, #:lo12:var32]
+; CHECK: str [[REG32]], [{{x[0-9]+}}, {{#?}}:lo12:var32]
%val1 = load volatile i64* %addr1
; CHECK: ldr [[REG64:x[0-9]+]], [sp, #16]
store i64 %val1, i64* @var64
-; CHECK: str [[REG64]], [{{x[0-9]+}}, #:lo12:var64]
+; CHECK: str [[REG64]], [{{x[0-9]+}}, {{#?}}:lo12:var64]
ret void
}
@@ -79,7 +76,7 @@ define i32 @return_int() {
; CHECK-LABEL: return_int:
%val = load i32* @var32
ret i32 %val
-; CHECK: ldr w0, [{{x[0-9]+}}, #:lo12:var32]
+; CHECK: ldr w0, [{{x[0-9]+}}, {{#?}}:lo12:var32]
; Make sure epilogue follows
; CHECK-NEXT: ret
}
@@ -87,7 +84,7 @@ define i32 @return_int() {
define double @return_double() {
; CHECK-LABEL: return_double:
ret double 3.14
-; CHECK: ldr d0, [{{x[0-9]+}}, #:lo12:.LCPI
+; CHECK: ldr d0, [{{x[0-9]+}}, {{#?}}:lo12:.LCPI
; CHECK-NOFP-NOT: ldr d0,
}
@@ -99,10 +96,10 @@ define [2 x i64] @return_struct() {
%addr = bitcast %myStruct* @varstruct to [2 x i64]*
%val = load [2 x i64]* %addr
ret [2 x i64] %val
-; CHECK: ldr x0, [{{x[0-9]+}}, #:lo12:varstruct]
+; CHECK-DAG: ldr x0, [{{x[0-9]+}}, {{#?}}:lo12:varstruct]
; The odd-looking register regex below disallows x0, which we want to stay live here.
-; CHECK: add {{x[1-9][0-9]*}}, {{x[1-9][0-9]*}}, #:lo12:varstruct
-; CHECK-NEXT: ldr x1, [{{x[1-9][0-9]*}}, #8]
+; CHECK-DAG: add {{x[1-9][0-9]*}}, {{x[1-9][0-9]*}}, {{#?}}:lo12:varstruct
+; CHECK: ldr x1, [{{x[1-9][0-9]*}}, #8]
; Make sure epilogue immediately follows
; CHECK-NEXT: ret
}
@@ -139,17 +136,16 @@ define i32 @struct_on_stack(i8 %var0, i16 %var1, i32 %var2, i64 %var3, i128 %var
store volatile i64 %val64, i64* @var64
; Currently nothing is on the local stack, so the struct should be at sp
; CHECK: ldr [[VAL64:x[0-9]+]], [sp]
-; CHECK: str [[VAL64]], [{{x[0-9]+}}, #:lo12:var64]
+; CHECK: str [[VAL64]], [{{x[0-9]+}}, {{#?}}:lo12:var64]
store volatile double %notstacked, double* @vardouble
; CHECK-NOT: ldr d0
-; CHECK: str d0, [{{x[0-9]+}}, #:lo12:vardouble
+; CHECK: str d0, [{{x[0-9]+}}, {{#?}}:lo12:vardouble
; CHECK-NOFP-NOT: str d0,
%retval = load volatile i32* %stacked
ret i32 %retval
; CHECK-LE: ldr w0, [sp, #16]
-; CHECK-BE: ldr w0, [sp, #20]
}
define void @stacked_fpu(float %var0, double %var1, float %var2, float %var3,
@@ -159,36 +155,36 @@ define void @stacked_fpu(float %var0, double %var1, float %var2, float %var3,
store float %var8, float* @varfloat
; Beware as above: the offset would be different on big-endian
; machines if the first ldr were changed to use s-registers.
-; CHECK: ldr d[[VALFLOAT:[0-9]+]], [sp]
-; CHECK: str s[[VALFLOAT]], [{{x[0-9]+}}, #:lo12:varfloat]
+; CHECK: ldr {{[ds]}}[[VALFLOAT:[0-9]+]], [sp]
+; CHECK: str s[[VALFLOAT]], [{{x[0-9]+}}, {{#?}}:lo12:varfloat]
ret void
}
; 128-bit integer types should be passed in xEVEN, xODD rather than
; the reverse. In this case x2 and x3. Nothing should use x1.
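; (Hypothetical C equivalent, for illustration only: in a call like
; f(int a, __int128 b, long c), b must start at an even register, so it
; occupies the pair x2/x3, x1 is left unused, and c lands in x4 -- which is
; what the checks below rely on.)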
-define i32 @check_i128_regalign(i32 %val0, i128 %val1, i32 %val2) {
-; CHECK: check_i128_regalign
+define i64 @check_i128_regalign(i32 %val0, i128 %val1, i64 %val2) {
+; CHECK-LABEL: check_i128_regalign
store i128 %val1, i128* @var128
-; CHECK: str x2, [{{x[0-9]+}}, #:lo12:var128]
-; CHECK: str x3, [{{x[0-9]+}}, #8]
+; CHECK-DAG: str x2, [{{x[0-9]+}}, {{#?}}:lo12:var128]
+; CHECK-DAG: str x3, [{{x[0-9]+}}, #8]
- ret i32 %val2
+ ret i64 %val2
; CHECK: mov x0, x4
}
define void @check_i128_stackalign(i32 %val0, i32 %val1, i32 %val2, i32 %val3,
i32 %val4, i32 %val5, i32 %val6, i32 %val7,
i32 %stack1, i128 %stack2) {
-; CHECK: check_i128_stackalign
+; CHECK-LABEL: check_i128_stackalign
store i128 %stack2, i128* @var128
; Nothing local is on the stack in current codegen, so the i128 starts 16 bytes above sp
; CHECK-LE: add x[[REG:[0-9]+]], sp, #16
; CHECK-LE: ldr {{x[0-9]+}}, [x[[REG]], #8]
-; CHECK-BE: ldr {{x[0-9]+}}, [sp, #24]
; The important point is that we address sp+24 for the second dword
-; CHECK: ldr {{x[0-9]+}}, [sp, #16]
+
+; CHECK: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
ret void
}
@@ -200,3 +196,13 @@ define i32 @test_extern() {
; CHECK: bl memcpy
ret i32 0
}
+
+
+; On big-endian targets a sub-i32 stack argument must be loaded with ldr{h,b},
+; not implicitly widened into a 32-bit load.
+define i16 @stacked_i16(i32 %val0, i32 %val1, i32 %val2, i32 %val3,
+ i32 %val4, i32 %val5, i32 %val6, i32 %val7,
+ i16 %stack1) {
+; CHECK-LABEL: stacked_i16
+ ret i16 %stack1
+}
diff --git a/test/CodeGen/AArch64/func-calls.ll b/test/CodeGen/AArch64/func-calls.ll
index f029bf2..422c576 100644
--- a/test/CodeGen/AArch64/func-calls.ll
+++ b/test/CodeGen/AArch64/func-calls.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-neon | FileCheck --check-prefix=CHECK-NONEON %s
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-BE --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm64_be-none-linux-gnu | FileCheck --check-prefix=CHECK-BE %s
%myStruct = type { i64 , i8, i32 }
@@ -24,15 +24,15 @@ define void @simple_args() {
%char1 = load i8* @var8
%char2 = load i8* @var8_2
call void @take_i8s(i8 %char1, i8 %char2)
-; CHECK-DAG: ldrb w0, [{{x[0-9]+}}, #:lo12:var8]
-; CHECK-DAG: ldrb w1, [{{x[0-9]+}}, #:lo12:var8_2]
+; CHECK-DAG: ldrb w0, [{{x[0-9]+}}, {{#?}}:lo12:var8]
+; CHECK-DAG: ldrb w1, [{{x[0-9]+}}, {{#?}}:lo12:var8_2]
; CHECK: bl take_i8s
%float1 = load float* @varfloat
%float2 = load float* @varfloat_2
call void @take_floats(float %float1, float %float2)
-; CHECK-DAG: ldr s1, [{{x[0-9]+}}, #:lo12:varfloat_2]
-; CHECK-DAG: ldr s0, [{{x[0-9]+}}, #:lo12:varfloat]
+; CHECK-DAG: ldr s1, [{{x[0-9]+}}, {{#?}}:lo12:varfloat_2]
+; CHECK-DAG: ldr s0, [{{x[0-9]+}}, {{#?}}:lo12:varfloat]
; CHECK: bl take_floats
; CHECK-NOFP-NOT: ldr s1,
; CHECK-NOFP-NOT: ldr s0,
@@ -51,22 +51,22 @@ define void @simple_rets() {
%int = call i32 @return_int()
store i32 %int, i32* @var32
; CHECK: bl return_int
-; CHECK: str w0, [{{x[0-9]+}}, #:lo12:var32]
+; CHECK: str w0, [{{x[0-9]+}}, {{#?}}:lo12:var32]
%dbl = call double @return_double()
store double %dbl, double* @vardouble
; CHECK: bl return_double
-; CHECK: str d0, [{{x[0-9]+}}, #:lo12:vardouble]
+; CHECK: str d0, [{{x[0-9]+}}, {{#?}}:lo12:vardouble]
; CHECK-NOFP-NOT: str d0,
%arr = call [2 x i64] @return_smallstruct()
store [2 x i64] %arr, [2 x i64]* @varsmallstruct
; CHECK: bl return_smallstruct
; CHECK: str x1, [{{x[0-9]+}}, #8]
-; CHECK: str x0, [{{x[0-9]+}}, #:lo12:varsmallstruct]
+; CHECK: str x0, [{{x[0-9]+}}, {{#?}}:lo12:varsmallstruct]
call void @return_large_struct(%myStruct* sret @varstruct)
-; CHECK: add x8, {{x[0-9]+}}, #:lo12:varstruct
+; CHECK: add x8, {{x[0-9]+}}, {{#?}}:lo12:varstruct
; CHECK: bl return_large_struct
ret void
@@ -88,19 +88,28 @@ define void @check_stack_args() {
; Want to check that the final double is passed in registers and
; that varstruct is passed on the stack. Rather dependent on how a
; memcpy gets created, but the following works for now.
-; CHECK: mov x[[SPREG:[0-9]+]], sp
-; CHECK-DAG: str {{w[0-9]+}}, [x[[SPREG]]]
-; CHECK-DAG: str {{w[0-9]+}}, [x[[SPREG]], #12]
-; CHECK-DAG: fmov d0,
+
+; CHECK-DAG: str {{q[0-9]+}}, [sp]
+; CHECK-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0
+; CHECK: mov v0.16b, v[[FINAL_DOUBLE]].16b
+
+; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp]
+; CHECK-NONEON-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0
+; CHECK-NONEON: fmov d0, d[[FINAL_DOUBLE]]
+
; CHECK: bl struct_on_stack
; CHECK-NOFP-NOT: fmov
call void @stacked_fpu(float -1.0, double 1.0, float 4.0, float 2.0,
float -2.0, float -8.0, float 16.0, float 1.0,
float 64.0)
-; CHECK: ldr s[[STACKEDREG:[0-9]+]], [{{x[0-9]+}}, #:lo12:.LCPI
-; CHECK: mov x0, sp
-; CHECK: str d[[STACKEDREG]], [x0]
+
+; CHECK: movz [[SIXTY_FOUR:w[0-9]+]], #0x4280, lsl #16
+; CHECK: str [[SIXTY_FOUR]], [sp]
+
+; CHECK-NONEON: movz [[SIXTY_FOUR:w[0-9]+]], #0x4280, lsl #16
+; CHECK-NONEON: str [[SIXTY_FOUR]], [sp]
+
; CHECK: bl stacked_fpu
ret void
}
@@ -119,18 +128,20 @@ define void @check_i128_align() {
call void @check_i128_stackalign(i32 0, i32 1, i32 2, i32 3,
i32 4, i32 5, i32 6, i32 7,
i32 42, i128 %val)
-; CHECK: ldr [[I128LO:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var128]
+; CHECK: ldr [[I128LO:x[0-9]+]], [{{x[0-9]+}}, {{#?}}:lo12:var128]
; CHECK: ldr [[I128HI:x[0-9]+]], [{{x[0-9]+}}, #8]
-; CHECK: mov x[[SPREG:[0-9]+]], sp
-; CHECK: str [[I128HI]], [x[[SPREG]], #24]
-; CHECK: str [[I128LO]], [x[[SPREG]], #16]
+; CHECK: stp [[I128LO]], [[I128HI]], [sp, #16]
+
+; CHECK-NONEON: ldr [[I128LO:x[0-9]+]], [{{x[0-9]+}}, :lo12:var128]
+; CHECK-NONEON: ldr [[I128HI:x[0-9]+]], [{{x[0-9]+}}, #8]
+; CHECK-NONEON: stp [[I128LO]], [[I128HI]], [sp, #16]
; CHECK: bl check_i128_stackalign
call void @check_i128_regalign(i32 0, i128 42)
; CHECK-NOT: mov x1
-; CHECK-LE: movz x2, #42
+; CHECK-LE: movz x2, #{{0x2a|42}}
; CHECK-LE: mov x3, xzr
-; CHECK-BE: movz x3, #42
+; CHECK-BE: movz {{x|w}}3, #{{0x2a|42}}
; CHECK-BE: mov x2, xzr
; CHECK: bl check_i128_regalign
@@ -143,7 +154,7 @@ define void @check_indirect_call() {
; CHECK-LABEL: check_indirect_call:
%func = load void()** @fptr
call void %func()
-; CHECK: ldr [[FPTR:x[0-9]+]], [{{x[0-9]+}}, #:lo12:fptr]
+; CHECK: ldr [[FPTR:x[0-9]+]], [{{x[0-9]+}}, {{#?}}:lo12:fptr]
; CHECK: blr [[FPTR]]
ret void
diff --git a/test/CodeGen/AArch64/global-alignment.ll b/test/CodeGen/AArch64/global-alignment.ll
index 56e5cba..451b9d6 100644
--- a/test/CodeGen/AArch64/global-alignment.ll
+++ b/test/CodeGen/AArch64/global-alignment.ll
@@ -1,8 +1,9 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
@var32 = global [3 x i32] zeroinitializer
@var64 = global [3 x i64] zeroinitializer
@var32_align64 = global [3 x i32] zeroinitializer, align 8
+@alias = alias [3 x i32]* @var32_align64
define i64 @test_align32() {
; CHECK-LABEL: test_align32:
@@ -12,7 +13,7 @@ define i64 @test_align32() {
; emit an "LDR x0, [x0, #:lo12:var32] instruction to implement this load.
%val = load i64* %addr
; CHECK: adrp [[HIBITS:x[0-9]+]], var32
-; CHECK: add x[[ADDR:[0-9]+]], [[HIBITS]], #:lo12:var32
+; CHECK: add x[[ADDR:[0-9]+]], [[HIBITS]], {{#?}}:lo12:var32
; CHECK: ldr x0, [x[[ADDR]]]
ret i64 %val
@@ -27,7 +28,7 @@ define i64 @test_align64() {
%val = load i64* %addr
; CHECK: adrp x[[HIBITS:[0-9]+]], var64
; CHECK-NOT: add x[[HIBITS]]
-; CHECK: ldr x0, [x[[HIBITS]], #:lo12:var64]
+; CHECK: ldr x0, [x[[HIBITS]], {{#?}}:lo12:var64]
ret i64 %val
}
@@ -41,7 +42,20 @@ define i64 @test_var32_align64() {
%val = load i64* %addr
; CHECK: adrp x[[HIBITS:[0-9]+]], var32_align64
; CHECK-NOT: add x[[HIBITS]]
-; CHECK: ldr x0, [x[[HIBITS]], #:lo12:var32_align64]
+; CHECK: ldr x0, [x[[HIBITS]], {{#?}}:lo12:var32_align64]
+
+ ret i64 %val
+}
+
+define i64 @test_var32_alias() {
+; CHECK-LABEL: test_var32_alias:
+ %addr = bitcast [3 x i32]* @alias to i64*
+
+ ; Test that we can find the alignment for aliases.
+ %val = load i64* %addr
+; CHECK: adrp x[[HIBITS:[0-9]+]], alias
+; CHECK-NOT: add x[[HIBITS]]
+; CHECK: ldr x0, [x[[HIBITS]], {{#?}}:lo12:alias]
ret i64 %val
}
@@ -56,7 +70,7 @@ define i64 @test_yet_another_var() {
; so we can't fold the load.
%val = load i64* bitcast({i32, i32}* @yet_another_var to i64*)
; CHECK: adrp [[HIBITS:x[0-9]+]], yet_another_var
-; CHECK: add x[[ADDR:[0-9]+]], [[HIBITS]], #:lo12:yet_another_var
+; CHECK: add x[[ADDR:[0-9]+]], [[HIBITS]], {{#?}}:lo12:yet_another_var
; CHECK: ldr x0, [x[[ADDR]]]
ret i64 %val
}
@@ -65,5 +79,5 @@ define i64()* @test_functions() {
; CHECK-LABEL: test_functions:
ret i64()* @test_yet_another_var
; CHECK: adrp [[HIBITS:x[0-9]+]], test_yet_another_var
-; CHECK: add x0, [[HIBITS]], #:lo12:test_yet_another_var
+; CHECK: add x0, [[HIBITS]], {{#?}}:lo12:test_yet_another_var
}
diff --git a/test/CodeGen/AArch64/got-abuse.ll b/test/CodeGen/AArch64/got-abuse.ll
index 8b06031..7a02b10 100644
--- a/test/CodeGen/AArch64/got-abuse.ll
+++ b/test/CodeGen/AArch64/got-abuse.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -filetype=obj < %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -filetype=obj -o - %s
; LLVM gives well-defined semantics to this horrible construct (though C says
; it's undefined). Regardless, we shouldn't crash. The important feature here is
@@ -17,7 +17,7 @@ define void @foo() nounwind {
entry:
call void @consume(i32 ptrtoint (void ()* @func to i32))
; CHECK: adrp x[[ADDRHI:[0-9]+]], :got:func
-; CHECK: ldr {{x[0-9]+}}, [x[[ADDRHI]], #:got_lo12:func]
+; CHECK: ldr {{x[0-9]+}}, [x[[ADDRHI]], {{#?}}:got_lo12:func]
ret void
}
diff --git a/test/CodeGen/AArch64/i1-contents.ll b/test/CodeGen/AArch64/i1-contents.ll
new file mode 100644
index 0000000..7f133fc
--- /dev/null
+++ b/test/CodeGen/AArch64/i1-contents.ll
@@ -0,0 +1,55 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
+%big = type i32
+
+@var = global %big 0
+
+; AAPCS: the low 8 bits of %in (== w0) will be either 0 or 1. It needs extending
+; to 32 bits.
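+; (Roughly the C equivalent, for illustration only: "void consume_i1_arg(bool b) { var = b; }" --
+; the callee itself has to mask/extend w0 before the 32-bit store, hence the
+; "and" in the checks below.)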
+define void @consume_i1_arg(i1 %in) {
+; CHECK-LABEL: consume_i1_arg:
+; CHECK: and [[BOOL32:w[0-9]+]], w0, #{{0x1|0xff}}
+; CHECK: str [[BOOL32]], [{{x[0-9]+}}, :lo12:var]
+ %val = zext i1 %in to %big
+ store %big %val, %big* @var
+ ret void
+}
+
+; AAPCS: the low 8 bits of %val1 (== w0) will be either 0 or 1. It needs extending
+; to 32 bits (it doesn't really matter whether that's from 1 or 8 bits).
+define void @consume_i1_ret() {
+; CHECK-LABEL: consume_i1_ret:
+; CHECK: bl produce_i1_ret
+; CHECK: and [[BOOL32:w[0-9]+]], w0, #{{0x1|0xff}}
+; CHECK: str [[BOOL32]], [{{x[0-9]+}}, :lo12:var]
+ %val1 = call i1 @produce_i1_ret()
+ %val = zext i1 %val1 to %big
+ store %big %val, %big* @var
+ ret void
+}
+
+; AAPCS: the low 8 bits of w0 must be either 0 or 1, so everything else has to be masked off.
+define i1 @produce_i1_ret() {
+; CHECK-LABEL: produce_i1_ret:
+; CHECK: ldr [[VAR32:w[0-9]+]], [{{x[0-9]+}}, :lo12:var]
+; CHECK: and w0, [[VAR32]], #{{0x1|0xff}}
+ %val = load %big* @var
+ %val1 = trunc %big %val to i1
+ ret i1 %val1
+}
+
+define void @produce_i1_arg() {
+; CHECK-LABEL: produce_i1_arg:
+; CHECK: ldr [[VAR32:w[0-9]+]], [{{x[0-9]+}}, :lo12:var]
+; CHECK: and w0, [[VAR32]], #{{0x1|0xff}}
+; CHECK: bl consume_i1_arg
+ %val = load %big* @var
+ %val1 = trunc %big %val to i1
+ call void @consume_i1_arg(i1 %val1)
+ ret void
+}
+
+
+;define zeroext i1 @foo(i8 %in) {
+; %val = trunc i8 %in to i1
+; ret i1 %val
+;}
diff --git a/test/CodeGen/AArch64/i128-align.ll b/test/CodeGen/AArch64/i128-align.ll
index 21ca7ed..a1b4d6f 100644
--- a/test/CodeGen/AArch64/i128-align.ll
+++ b/test/CodeGen/AArch64/i128-align.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-ios7.0 -verify-machineinstrs -o - %s | FileCheck %s
%struct = type { i32, i128, i8 }
@@ -13,7 +13,7 @@ define i64 @check_size() {
%diff = sub i64 %endi, %starti
ret i64 %diff
-; CHECK: movz x0, #48
+; CHECK: {{movz x0, #48|orr w0, wzr, #0x30}}
}
define i64 @check_field() {
@@ -25,5 +25,5 @@ define i64 @check_field() {
%diff = sub i64 %endi, %starti
ret i64 %diff
-; CHECK: movz x0, #16
+; CHECK: {{movz x0, #16|orr w0, wzr, #0x10}}
}
diff --git a/test/CodeGen/AArch64/i128-shift.ll b/test/CodeGen/AArch64/i128-shift.ll
deleted file mode 100644
index d786d44..0000000
--- a/test/CodeGen/AArch64/i128-shift.ll
+++ /dev/null
@@ -1,43 +0,0 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
-
-define i128 @test_i128_lsl(i128 %a, i32 %shift) {
-; CHECK-LABEL: test_i128_lsl:
-
- %sh_prom = zext i32 %shift to i128
- %shl = shl i128 %a, %sh_prom
-
-; CHECK: movz [[SIXTYFOUR:x[0-9]+]], #64
-; CHECK-NEXT: sub [[REVSHAMT:x[0-9]+]], [[SIXTYFOUR]], [[SHAMT_32:w[0-9]+]], uxtw
-; CHECK-NEXT: lsr [[TMP1:x[0-9]+]], [[LO:x[0-9]+]], [[REVSHAMT]]
-; CHECK: lsl [[TMP2:x[0-9]+]], [[HI:x[0-9]+]], [[SHAMT:x[0-9]+]]
-; CHECK-NEXT: orr [[FALSEVAL:x[0-9]+]], [[TMP1]], [[TMP2]]
-; CHECK-NEXT: sub [[EXTRASHAMT:x[0-9]+]], [[SHAMT]], #64
-; CHECK-NEXT: lsl [[TMP3:x[0-9]+]], [[LO]], [[EXTRASHAMT]]
-; CHECK-NEXT: cmp [[EXTRASHAMT]], #0
-; CHECK-NEXT: csel [[RESULTHI:x[0-9]+]], [[TMP3]], [[FALSEVAL]], ge
-; CHECK-NEXT: lsl [[TMP4:x[0-9]+]], [[LO]], [[SHAMT]]
-; CHECK-NEXT: csel [[RESULTLO:x[0-9]+]], xzr, [[TMP4]], ge
-
- ret i128 %shl
-}
-
-define i128 @test_i128_shr(i128 %a, i32 %shift) {
-; CHECK-LABEL: test_i128_shr:
-
- %sh_prom = zext i32 %shift to i128
- %shr = lshr i128 %a, %sh_prom
-
-; CHECK: movz [[SIXTYFOUR]], #64
-; CHECK-NEXT: sub [[REVSHAMT:x[0-9]+]], [[SIXTYFOUR]], [[SHAMT_32:w[0-9]+]], uxtw
-; CHECK-NEXT: lsl [[TMP2:x[0-9]+]], [[HI:x[0-9]+]], [[REVSHAMT]]
-; CHECK: lsr [[TMP1:x[0-9]+]], [[LO:x[0-9]+]], [[SHAMT:x[0-9]+]]
-; CHECK-NEXT: orr [[FALSEVAL:x[0-9]+]], [[TMP1]], [[TMP2]]
-; CHECK-NEXT: sub [[EXTRASHAMT:x[0-9]+]], [[SHAMT]], #64
-; CHECK-NEXT: lsr [[TRUEVAL:x[0-9]+]], [[HI]], [[EXTRASHAMT]]
-; CHECK-NEXT: cmp [[EXTRASHAMT]], #0
-; CHECK-NEXT: csel [[RESULTLO:x[0-9]+]], [[TRUEVAL]], [[FALSEVAL]], ge
-; CHECK-NEXT: lsr [[TMP3:x[0-9]+]], [[HI]], [[SHAMT]]
-; CHECK-NEXT: csel [[RESULTHI:x[0-9]+]], xzr, [[TMP3]], ge
-
- ret i128 %shr
-}
diff --git a/test/CodeGen/AArch64/illegal-float-ops.ll b/test/CodeGen/AArch64/illegal-float-ops.ll
index 03c6d8d..9f7dd99 100644
--- a/test/CodeGen/AArch64/illegal-float-ops.ll
+++ b/test/CodeGen/AArch64/illegal-float-ops.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
@varfloat = global float 0.0
@vardouble = global double 0.0
diff --git a/test/CodeGen/AArch64/init-array.ll b/test/CodeGen/AArch64/init-array.ll
index 076ae27..f47b490 100644
--- a/test/CodeGen/AArch64/init-array.ll
+++ b/test/CodeGen/AArch64/init-array.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -use-init-array < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs -use-init-array < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -use-init-array -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs -use-init-array -o - %s | FileCheck %s
define internal void @_GLOBAL__I_a() section ".text.startup" {
ret void
diff --git a/test/CodeGen/AArch64/inline-asm-constraints-badI.ll b/test/CodeGen/AArch64/inline-asm-constraints-badI.ll
index 61bbfc2..9d833d9 100644
--- a/test/CodeGen/AArch64/inline-asm-constraints-badI.ll
+++ b/test/CodeGen/AArch64/inline-asm-constraints-badI.ll
@@ -1,7 +1,7 @@
-; RUN: not llc -mtriple=aarch64-none-linux-gnu < %s
+; RUN: not llc -mtriple=aarch64-none-linux-gnu -o - %s
define void @foo() {
; Out of range immediate for I.
- call void asm sideeffect "add x0, x0, $0", "I"(i32 4096)
+ call void asm sideeffect "add x0, x0, $0", "I"(i32 4097)
ret void
}
diff --git a/test/CodeGen/AArch64/inline-asm-constraints-badK.ll b/test/CodeGen/AArch64/inline-asm-constraints-badK.ll
index 40746e1..6ffc05d 100644
--- a/test/CodeGen/AArch64/inline-asm-constraints-badK.ll
+++ b/test/CodeGen/AArch64/inline-asm-constraints-badK.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -mtriple=aarch64-none-linux-gnu < %s
+; RUN: not llc -mtriple=arm64-apple-ios7.0 -o - %s
define void @foo() {
; 32-bit bitpattern ending in 1101 can't be produced.
diff --git a/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll b/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll
index 2c53381..1726013 100644
--- a/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll
+++ b/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -mtriple=aarch64-none-linux-gnu < %s
+; RUN: not llc -mtriple=aarch64-none-linux-gnu -o - %s
define void @foo() {
; 32-bit bitpattern ending in 1101 can't be produced.
diff --git a/test/CodeGen/AArch64/inline-asm-constraints-badL.ll b/test/CodeGen/AArch64/inline-asm-constraints-badL.ll
index d82d5a2..3c2f60c 100644
--- a/test/CodeGen/AArch64/inline-asm-constraints-badL.ll
+++ b/test/CodeGen/AArch64/inline-asm-constraints-badL.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -mtriple=aarch64-none-linux-gnu < %s
+; RUN: not llc -mtriple=arm64-apple-ios7.0 -o - %s
define void @foo() {
; 32-bit bitpattern ending in 1101 can't be produced.
diff --git a/test/CodeGen/AArch64/inline-asm-constraints.ll b/test/CodeGen/AArch64/inline-asm-constraints.ll
deleted file mode 100644
index 365453c..0000000
--- a/test/CodeGen/AArch64/inline-asm-constraints.ll
+++ /dev/null
@@ -1,137 +0,0 @@
-;RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -no-integrated-as < %s | FileCheck %s
-
-define i64 @test_inline_constraint_r(i64 %base, i32 %offset) {
-; CHECK-LABEL: test_inline_constraint_r:
- %val = call i64 asm "add $0, $1, $2, sxtw", "=r,r,r"(i64 %base, i32 %offset)
-; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtw
- ret i64 %val
-}
-
-define i16 @test_small_reg(i16 %lhs, i16 %rhs) {
-; CHECK-LABEL: test_small_reg:
- %val = call i16 asm sideeffect "add $0, $1, $2, sxth", "=r,r,r"(i16 %lhs, i16 %rhs)
-; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, sxth
- ret i16 %val
-}
-
-define i64 @test_inline_constraint_r_imm(i64 %base, i32 %offset) {
-; CHECK-LABEL: test_inline_constraint_r_imm:
- %val = call i64 asm "add $0, $1, $2, sxtw", "=r,r,r"(i64 4, i32 12)
-; CHECK: movz [[FOUR:x[0-9]+]], #4
-; CHECK: movz [[TWELVE:w[0-9]+]], #12
-; CHECK: add {{x[0-9]+}}, [[FOUR]], [[TWELVE]], sxtw
- ret i64 %val
-}
-
-; m is permitted to have a base/offset form. We don't do that
-; currently though.
-define i32 @test_inline_constraint_m(i32 *%ptr) {
-; CHECK-LABEL: test_inline_constraint_m:
- %val = call i32 asm "ldr $0, $1", "=r,m"(i32 *%ptr)
-; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}]
- ret i32 %val
-}
-
-@arr = global [8 x i32] zeroinitializer
-
-; Q should *never* have base/offset form even if given the chance.
-define i32 @test_inline_constraint_Q(i32 *%ptr) {
-; CHECK-LABEL: test_inline_constraint_Q:
- %val = call i32 asm "ldr $0, $1", "=r,Q"(i32* getelementptr([8 x i32]* @arr, i32 0, i32 1))
-; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}]
- ret i32 %val
-}
-
-@dump = global fp128 zeroinitializer
-
-define void @test_inline_constraint_w(<8 x i8> %vec64, <4 x float> %vec128, half %hlf, float %flt, double %dbl, fp128 %quad) {
-; CHECK: test_inline_constraint_w:
- call <8 x i8> asm sideeffect "add $0.8b, $1.8b, $1.8b", "=w,w"(<8 x i8> %vec64)
- call <8 x i8> asm sideeffect "fadd $0.4s, $1.4s, $1.4s", "=w,w"(<4 x float> %vec128)
-; CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-; CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-
- ; Arguably semantically dodgy to output "vN", but it's what GCC does
- ; so purely for compatibility we want vector registers to be output.
- call float asm sideeffect "fcvt ${0:s}, ${1:h}", "=w,w"(half undef)
- call float asm sideeffect "fadd $0.2s, $0.2s, $0.2s", "=w,w"(float %flt)
- call double asm sideeffect "fadd $0.2d, $0.2d, $0.2d", "=w,w"(double %dbl)
- call fp128 asm sideeffect "fadd $0.2d, $0.2d, $0.2d", "=w,w"(fp128 %quad)
-; CHECK: fcvt {{s[0-9]+}}, {{h[0-9]+}}
-; CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-; CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
-; CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
- ret void
-}
-
-define void @test_inline_constraint_I() {
-; CHECK-LABEL: test_inline_constraint_I:
- call void asm sideeffect "add x0, x0, $0", "I"(i32 0)
- call void asm sideeffect "add x0, x0, $0", "I"(i64 4095)
-; CHECK: add x0, x0, #0
-; CHECK: add x0, x0, #4095
-
- ret void
-}
-
-; Skip J because it's useless
-
-define void @test_inline_constraint_K() {
-; CHECK-LABEL: test_inline_constraint_K:
- call void asm sideeffect "and w0, w0, $0", "K"(i32 2863311530) ; = 0xaaaaaaaa
- call void asm sideeffect "and w0, w0, $0", "K"(i32 65535)
-; CHECK: and w0, w0, #-1431655766
-; CHECK: and w0, w0, #65535
-
- ret void
-}
-
-define void @test_inline_constraint_L() {
-; CHECK-LABEL: test_inline_constraint_L:
- call void asm sideeffect "and x0, x0, $0", "L"(i64 4294967296) ; = 0xaaaaaaaa
- call void asm sideeffect "and x0, x0, $0", "L"(i64 65535)
-; CHECK: and x0, x0, #4294967296
-; CHECK: and x0, x0, #65535
-
- ret void
-}
-
-; Skip M and N because we don't support MOV pseudo-instructions yet.
-
-@var = global i32 0
-
-define void @test_inline_constraint_S() {
-; CHECK-LABEL: test_inline_constraint_S:
- call void asm sideeffect "adrp x0, $0", "S"(i32* @var)
- call void asm sideeffect "adrp x0, ${0:A}", "S"(i32* @var)
- call void asm sideeffect "add x0, x0, ${0:L}", "S"(i32* @var)
-; CHECK: adrp x0, var
-; CHECK: adrp x0, var
-; CHECK: add x0, x0, #:lo12:var
- ret void
-}
-
-define i32 @test_inline_constraint_S_label(i1 %in) {
-; CHECK-LABEL: test_inline_constraint_S_label:
- call void asm sideeffect "adr x0, $0", "S"(i8* blockaddress(@test_inline_constraint_S_label, %loc))
-; CHECK: adr x0, .Ltmp{{[0-9]+}}
- br i1 %in, label %loc, label %loc2
-loc:
- ret i32 0
-loc2:
- ret i32 42
-}
-
-define void @test_inline_constraint_Y() {
-; CHECK-LABEL: test_inline_constraint_Y:
- call void asm sideeffect "fcmp s0, $0", "Y"(float 0.0)
-; CHECK: fcmp s0, #0.0
- ret void
-}
-
-define void @test_inline_constraint_Z() {
-; CHECK-LABEL: test_inline_constraint_Z:
- call void asm sideeffect "cmp w0, $0", "Z"(i32 0)
-; CHECK: cmp w0, #0
- ret void
-}
diff --git a/test/CodeGen/AArch64/inline-asm-modifiers.ll b/test/CodeGen/AArch64/inline-asm-modifiers.ll
deleted file mode 100644
index cb66335..0000000
--- a/test/CodeGen/AArch64/inline-asm-modifiers.ll
+++ /dev/null
@@ -1,147 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -no-integrated-as < %s | FileCheck %s
-
-@var_simple = hidden global i32 0
-@var_got = global i32 0
-@var_tlsgd = thread_local global i32 0
-@var_tlsld = thread_local(localdynamic) global i32 0
-@var_tlsie = thread_local(initialexec) global i32 0
-@var_tlsle = thread_local(localexec) global i32 0
-
-define void @test_inline_modifier_L() nounwind {
-; CHECK-LABEL: test_inline_modifier_L:
- call void asm sideeffect "add x0, x0, ${0:L}", "S,~{x0}"(i32* @var_simple)
- call void asm sideeffect "ldr x0, [x0, ${0:L}]", "S,~{x0}"(i32* @var_got)
- call void asm sideeffect "add x0, x0, ${0:L}", "S,~{x0}"(i32* @var_tlsgd)
- call void asm sideeffect "add x0, x0, ${0:L}", "S,~{x0}"(i32* @var_tlsld)
- call void asm sideeffect "ldr x0, [x0, ${0:L}]", "S,~{x0}"(i32* @var_tlsie)
- call void asm sideeffect "add x0, x0, ${0:L}", "S,~{x0}"(i32* @var_tlsle)
-; CHECK: add x0, x0, #:lo12:var_simple
-; CHECK: ldr x0, [x0, #:got_lo12:var_got]
-; CHECK: add x0, x0, #:tlsdesc_lo12:var_tlsgd
-; CHECK: add x0, x0, #:dtprel_lo12:var_tlsld
-; CHECK: ldr x0, [x0, #:gottprel_lo12:var_tlsie]
-; CHECK: add x0, x0, #:tprel_lo12:var_tlsle
-
- call void asm sideeffect "add x0, x0, ${0:L}", "Si,~{x0}"(i32 64)
- call void asm sideeffect "ldr x0, [x0, ${0:L}]", "Si,~{x0}"(i32 64)
-; CHECK: add x0, x0, #64
-; CHECK: ldr x0, [x0, #64]
-
- ret void
-}
-
-define void @test_inline_modifier_G() nounwind {
-; CHECK-LABEL: test_inline_modifier_G:
- call void asm sideeffect "add x0, x0, ${0:G}, lsl #12", "S,~{x0}"(i32* @var_tlsld)
- call void asm sideeffect "add x0, x0, ${0:G}, lsl #12", "S,~{x0}"(i32* @var_tlsle)
-; CHECK: add x0, x0, #:dtprel_hi12:var_tlsld, lsl #12
-; CHECK: add x0, x0, #:tprel_hi12:var_tlsle, lsl #12
-
- call void asm sideeffect "add x0, x0, ${0:G}", "Si,~{x0}"(i32 42)
-; CHECK: add x0, x0, #42
- ret void
-}
-
-define void @test_inline_modifier_A() nounwind {
-; CHECK-LABEL: test_inline_modifier_A:
- call void asm sideeffect "adrp x0, ${0:A}", "S,~{x0}"(i32* @var_simple)
- call void asm sideeffect "adrp x0, ${0:A}", "S,~{x0}"(i32* @var_got)
- call void asm sideeffect "adrp x0, ${0:A}", "S,~{x0}"(i32* @var_tlsgd)
- call void asm sideeffect "adrp x0, ${0:A}", "S,~{x0}"(i32* @var_tlsie)
- ; N.b. All tprel and dtprel relocs are modified: lo12 or granules.
-; CHECK: adrp x0, var_simple
-; CHECK: adrp x0, :got:var_got
-; CHECK: adrp x0, :tlsdesc:var_tlsgd
-; CHECK: adrp x0, :gottprel:var_tlsie
-
- call void asm sideeffect "adrp x0, ${0:A}", "Si,~{x0}"(i32 40)
-; CHECK: adrp x0, #40
-
- ret void
-}
-
-define void @test_inline_modifier_wx(i32 %small, i64 %big) nounwind {
-; CHECK-LABEL: test_inline_modifier_wx:
- call i32 asm sideeffect "add $0, $0, $0", "=r,0"(i32 %small)
- call i32 asm sideeffect "add ${0:w}, ${0:w}, ${0:w}", "=r,0"(i32 %small)
- call i32 asm sideeffect "add ${0:x}, ${0:x}, ${0:x}", "=r,0"(i32 %small)
-; CHECK: //APP
-; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-
- call i64 asm sideeffect "add $0, $0, $0", "=r,0"(i64 %big)
- call i64 asm sideeffect "add ${0:w}, ${0:w}, ${0:w}", "=r,0"(i64 %big)
- call i64 asm sideeffect "add ${0:x}, ${0:x}, ${0:x}", "=r,0"(i64 %big)
-; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-
- call i32 asm sideeffect "add ${0:w}, ${1:w}, ${1:w}", "=r,r"(i32 0)
- call i32 asm sideeffect "add ${0:x}, ${1:x}, ${1:x}", "=r,r"(i32 0)
-; CHECK: add {{w[0-9]+}}, wzr, wzr
-; CHECK: add {{x[0-9]+}}, xzr, xzr
-
- call i32 asm sideeffect "add ${0:w}, ${0:w}, ${1:w}", "=r,Ir,0"(i32 123, i32 %small)
- call i64 asm sideeffect "add ${0:x}, ${0:x}, ${1:x}", "=r,Ir,0"(i32 456, i64 %big)
-; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #123
-; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #456
-
- ret void
-}
-
-define void @test_inline_modifier_bhsdq() nounwind {
-; CHECK-LABEL: test_inline_modifier_bhsdq:
- call float asm sideeffect "ldr ${0:b}, [sp]", "=w"()
- call float asm sideeffect "ldr ${0:h}, [sp]", "=w"()
- call float asm sideeffect "ldr ${0:s}, [sp]", "=w"()
- call float asm sideeffect "ldr ${0:d}, [sp]", "=w"()
- call float asm sideeffect "ldr ${0:q}, [sp]", "=w"()
-; CHECK: ldr b0, [sp]
-; CHECK: ldr h0, [sp]
-; CHECK: ldr s0, [sp]
-; CHECK: ldr d0, [sp]
-; CHECK: ldr q0, [sp]
-
- call double asm sideeffect "ldr ${0:b}, [sp]", "=w"()
- call double asm sideeffect "ldr ${0:h}, [sp]", "=w"()
- call double asm sideeffect "ldr ${0:s}, [sp]", "=w"()
- call double asm sideeffect "ldr ${0:d}, [sp]", "=w"()
- call double asm sideeffect "ldr ${0:q}, [sp]", "=w"()
-; CHECK: ldr b0, [sp]
-; CHECK: ldr h0, [sp]
-; CHECK: ldr s0, [sp]
-; CHECK: ldr d0, [sp]
-; CHECK: ldr q0, [sp]
-
- call void asm sideeffect "fcmp b0, ${0:b}", "Yw"(float 0.0)
- call void asm sideeffect "fcmp h0, ${0:h}", "Yw"(float 0.0)
- call void asm sideeffect "fcmp s0, ${0:s}", "Yw"(float 0.0)
- call void asm sideeffect "fcmp d0, ${0:d}", "Yw"(float 0.0)
- call void asm sideeffect "fcmp q0, ${0:q}", "Yw"(float 0.0)
-; CHECK: fcmp b0, #0
-; CHECK: fcmp h0, #0
-; CHECK: fcmp s0, #0
-; CHECK: fcmp d0, #0
-; CHECK: fcmp q0, #0
-
- ret void
-}
-
-define void @test_inline_modifier_c() nounwind {
-; CHECK-LABEL: test_inline_modifier_c:
- call void asm sideeffect "adr x0, ${0:c}", "i"(i32 3)
-; CHECK: adr x0, 3
-
- ret void
-}
-
-define void @test_inline_modifier_a() nounwind {
-; CHECK-LABEL: test_inline_modifier_a:
- call void asm sideeffect "prfm pldl1keep, ${0:a}", "r"(i32* @var_simple)
-; CHECK: adrp [[VARHI:x[0-9]+]], var_simple
-; CHECK: add x[[VARADDR:[0-9]+]], [[VARHI]], #:lo12:var_simple
-; CHECK: prfm pldl1keep, [x[[VARADDR]]]
- ret void
-}
-
diff --git a/test/CodeGen/AArch64/jump-table.ll b/test/CodeGen/AArch64/jump-table.ll
index 94717f5..1dfb789 100644
--- a/test/CodeGen/AArch64/jump-table.ll
+++ b/test/CodeGen/AArch64/jump-table.ll
@@ -1,6 +1,6 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
-; RUN: llc -code-model=large -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK-LARGE %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic <%s | FileCheck --check-prefix=CHECK-PIC %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -code-model=large -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK-LARGE %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic -o - %s | FileCheck --check-prefix=CHECK-PIC %s
define i32 @test_jumptable(i32 %in) {
; CHECK: test_jumptable
@@ -12,7 +12,7 @@ define i32 @test_jumptable(i32 %in) {
i32 4, label %lbl4
]
; CHECK: adrp [[JTPAGE:x[0-9]+]], .LJTI0_0
-; CHECK: add x[[JT:[0-9]+]], [[JTPAGE]], #:lo12:.LJTI0_0
+; CHECK: add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI0_0
; CHECK: ldr [[DEST:x[0-9]+]], [x[[JT]], {{x[0-9]+}}, lsl #3]
; CHECK: br [[DEST]]
@@ -24,7 +24,7 @@ define i32 @test_jumptable(i32 %in) {
; CHECK-LARGE: br [[DEST]]
; CHECK-PIC: adrp [[JTPAGE:x[0-9]+]], .LJTI0_0
-; CHECK-PIC: add x[[JT:[0-9]+]], [[JTPAGE]], #:lo12:.LJTI0_0
+; CHECK-PIC: add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI0_0
; CHECK-PIC: ldrsw [[DEST:x[0-9]+]], [x[[JT]], {{x[0-9]+}}, lsl #2]
; CHECK-PIC: add [[TABLE:x[0-9]+]], [[DEST]], x[[JT]]
; CHECK-PIC: br [[TABLE]]
diff --git a/test/CodeGen/AArch64/large-consts.ll b/test/CodeGen/AArch64/large-consts.ll
index 1b769c6..6bf85e8 100644
--- a/test/CodeGen/AArch64/large-consts.ll
+++ b/test/CodeGen/AArch64/large-consts.ll
@@ -4,10 +4,11 @@
; it's not the linker's job to put it there.
define double @foo() {
-; CHECK: movz [[CPADDR:x[0-9]+]], #:abs_g3:.LCPI0_0 // encoding: [A,A,0xe0'A',0xd2'A']
-; CHECK: movk [[CPADDR]], #:abs_g2_nc:.LCPI0_0 // encoding: [A,A,0xc0'A',0xf2'A']
-; CHECK: movk [[CPADDR]], #:abs_g1_nc:.LCPI0_0 // encoding: [A,A,0xa0'A',0xf2'A']
-; CHECK: movk [[CPADDR]], #:abs_g0_nc:.LCPI0_0 // encoding: [A,A,0x80'A',0xf2'A']
+
+; CHECK: movz [[CPADDR:x[0-9]+]], #:abs_g3:.LCPI0_0 // encoding: [0bAAA01000,A,0b111AAAAA,0xd2]
+; CHECK: movk [[CPADDR]], #:abs_g2_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b110AAAAA,0xf2]
+; CHECK: movk [[CPADDR]], #:abs_g1_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b101AAAAA,0xf2]
+; CHECK: movk [[CPADDR]], #:abs_g0_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b100AAAAA,0xf2]
ret double 3.14159
}
diff --git a/test/CodeGen/AArch64/large-frame.ll b/test/CodeGen/AArch64/large-frame.ll
deleted file mode 100644
index fde3036..0000000
--- a/test/CodeGen/AArch64/large-frame.ll
+++ /dev/null
@@ -1,119 +0,0 @@
-; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
-declare void @use_addr(i8*)
-
-@addr = global i8* null
-
-define void @test_bigframe() {
-; CHECK-LABEL: test_bigframe:
-; CHECK: .cfi_startproc
-
- %var1 = alloca i8, i32 20000000
- %var2 = alloca i8, i32 16
- %var3 = alloca i8, i32 20000000
-; CHECK: sub sp, sp, #496
-; CHECK: .cfi_def_cfa sp, 496
-; CHECK: str x30, [sp, #488]
- ; Total adjust is 39999536
-; CHECK: movz [[SUBCONST:x[0-9]+]], #22576
-; CHECK: movk [[SUBCONST]], #610, lsl #16
-; CHECK: sub sp, sp, [[SUBCONST]]
-; CHECK: .cfi_def_cfa sp, 40000032
-; CHECK: .cfi_offset x30, -8
-
- ; Total offset is 20000024
-; CHECK: movz [[VAR1OFFSET:x[0-9]+]], #11544
-; CHECK: movk [[VAR1OFFSET]], #305, lsl #16
-; CHECK: add {{x[0-9]+}}, sp, [[VAR1OFFSET]]
- store volatile i8* %var1, i8** @addr
-
- %var1plus2 = getelementptr i8* %var1, i32 2
- store volatile i8* %var1plus2, i8** @addr
-
-; CHECK: movz [[VAR2OFFSET:x[0-9]+]], #11528
-; CHECK: movk [[VAR2OFFSET]], #305, lsl #16
-; CHECK: add {{x[0-9]+}}, sp, [[VAR2OFFSET]]
- store volatile i8* %var2, i8** @addr
-
- %var2plus2 = getelementptr i8* %var2, i32 2
- store volatile i8* %var2plus2, i8** @addr
-
- store volatile i8* %var3, i8** @addr
-
- %var3plus2 = getelementptr i8* %var3, i32 2
- store volatile i8* %var3plus2, i8** @addr
-
-; CHECK: movz [[ADDCONST:x[0-9]+]], #22576
-; CHECK: movk [[ADDCONST]], #610, lsl #16
-; CHECK: add sp, sp, [[ADDCONST]]
-; CHECK: .cfi_endproc
- ret void
-}
-
-define void @test_mediumframe() {
-; CHECK-LABEL: test_mediumframe:
- %var1 = alloca i8, i32 1000000
- %var2 = alloca i8, i32 16
- %var3 = alloca i8, i32 1000000
-; CHECK: sub sp, sp, #496
-; CHECK: str x30, [sp, #488]
-; CHECK: sub sp, sp, #688
-; CHECK-NEXT: sub sp, sp, #488, lsl #12
-
- store volatile i8* %var1, i8** @addr
-; CHECK: add [[VAR1ADDR:x[0-9]+]], sp, #600
-; CHECK: add [[VAR1ADDR]], [[VAR1ADDR]], #244, lsl #12
-
- %var1plus2 = getelementptr i8* %var1, i32 2
- store volatile i8* %var1plus2, i8** @addr
-; CHECK: add [[VAR1PLUS2:x[0-9]+]], {{x[0-9]+}}, #2
-
- store volatile i8* %var2, i8** @addr
-; CHECK: add [[VAR2ADDR:x[0-9]+]], sp, #584
-; CHECK: add [[VAR2ADDR]], [[VAR2ADDR]], #244, lsl #12
-
- %var2plus2 = getelementptr i8* %var2, i32 2
- store volatile i8* %var2plus2, i8** @addr
-; CHECK: add [[VAR2PLUS2:x[0-9]+]], {{x[0-9]+}}, #2
-
- store volatile i8* %var3, i8** @addr
-
- %var3plus2 = getelementptr i8* %var3, i32 2
- store volatile i8* %var3plus2, i8** @addr
-
-; CHECK: add sp, sp, #688
-; CHECK: add sp, sp, #488, lsl #12
-; CHECK: ldr x30, [sp, #488]
-; CHECK: add sp, sp, #496
- ret void
-}
-
-
-@bigspace = global [8 x i64] zeroinitializer
-
-; If temporary registers are allocated for adjustment, they should *not* clobber
-; argument registers.
-define void @test_tempallocation([8 x i64] %val) nounwind {
-; CHECK-LABEL: test_tempallocation:
- %var = alloca i8, i32 1000000
-; CHECK: sub sp, sp,
-
-; Make sure the prologue is reasonably efficient
-; CHECK-NEXT: stp x29, x30, [sp,
-; CHECK-NEXT: stp x25, x26, [sp,
-; CHECK-NEXT: stp x23, x24, [sp,
-; CHECK-NEXT: stp x21, x22, [sp,
-; CHECK-NEXT: stp x19, x20, [sp,
-
-; Make sure we don't trash an argument register
-; CHECK-NOT: movz {{x[0-7],}}
-; CHECK: sub sp, sp,
-
-; CHECK-NOT: movz {{x[0-7],}}
-
-; CHECK: bl use_addr
- call void @use_addr(i8* %var)
-
- store [8 x i64] %val, [8 x i64]* @bigspace
- ret void
-; CHECK: ret
-}
diff --git a/test/CodeGen/AArch64/ldst-opt.ll b/test/CodeGen/AArch64/ldst-opt.ll
new file mode 100644
index 0000000..1ce5c95
--- /dev/null
+++ b/test/CodeGen/AArch64/ldst-opt.ll
@@ -0,0 +1,301 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
+
+; This file contains tests for the AArch64 load/store optimizer.
+
+%padding = type { i8*, i8*, i8*, i8* }
+%s.word = type { i32, i32 }
+%s.doubleword = type { i64, i32 }
+%s.quadword = type { fp128, i32 }
+%s.float = type { float, i32 }
+%s.double = type { double, i32 }
+%struct.word = type { %padding, %s.word }
+%struct.doubleword = type { %padding, %s.doubleword }
+%struct.quadword = type { %padding, %s.quadword }
+%struct.float = type { %padding, %s.float }
+%struct.double = type { %padding, %s.double }
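+
+; The %padding field is four pointers = 32 bytes, so the interesting payload of
+; each %struct.* above sits at offset #32 from the struct base.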
+
+; Check the following transform:
+;
+; (ldr|str) X, [x0, #32]
+; ...
+; add x0, x0, #32
+; ->
+; (ldr|str) X, [x0, #32]!
+;
+; with X being either w1, x1, s0, d0 or q0.
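+;
+; Roughly the source shape (hypothetical C, for illustration only):
+;
+;   use(ptr->payload.field);   /* load/store at [x0, #32]        */
+;   bar(&ptr->payload);        /* x0 + #32 is still needed after */
+;
+; so the separate "add x0, x0, #32" can be folded into the memory access as a
+; pre-indexed writeback.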
+
+declare void @bar_word(%s.word*, i32)
+
+define void @load-pre-indexed-word(%struct.word* %ptr) nounwind {
+; CHECK-LABEL: load-pre-indexed-word
+; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+ %a = getelementptr inbounds %struct.word* %ptr, i64 0, i32 1, i32 0
+ %add = load i32* %a, align 4
+ br label %bar
+bar:
+ %c = getelementptr inbounds %struct.word* %ptr, i64 0, i32 1
+ tail call void @bar_word(%s.word* %c, i32 %add)
+ ret void
+}
+
+define void @store-pre-indexed-word(%struct.word* %ptr, i32 %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-word
+; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+ %a = getelementptr inbounds %struct.word* %ptr, i64 0, i32 1, i32 0
+ store i32 %val, i32* %a, align 4
+ br label %bar
+bar:
+ %c = getelementptr inbounds %struct.word* %ptr, i64 0, i32 1
+ tail call void @bar_word(%s.word* %c, i32 %val)
+ ret void
+}
+
+declare void @bar_doubleword(%s.doubleword*, i64)
+
+define void @load-pre-indexed-doubleword(%struct.doubleword* %ptr) nounwind {
+; CHECK-LABEL: load-pre-indexed-doubleword
+; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+ %a = getelementptr inbounds %struct.doubleword* %ptr, i64 0, i32 1, i32 0
+ %add = load i64* %a, align 4
+ br label %bar
+bar:
+ %c = getelementptr inbounds %struct.doubleword* %ptr, i64 0, i32 1
+ tail call void @bar_doubleword(%s.doubleword* %c, i64 %add)
+ ret void
+}
+
+define void @store-pre-indexed-doubleword(%struct.doubleword* %ptr, i64 %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-doubleword
+; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+ %a = getelementptr inbounds %struct.doubleword* %ptr, i64 0, i32 1, i32 0
+ store i64 %val, i64* %a, align 4
+ br label %bar
+bar:
+ %c = getelementptr inbounds %struct.doubleword* %ptr, i64 0, i32 1
+ tail call void @bar_doubleword(%s.doubleword* %c, i64 %val)
+ ret void
+}
+
+declare void @bar_quadword(%s.quadword*, fp128)
+
+define void @load-pre-indexed-quadword(%struct.quadword* %ptr) nounwind {
+; CHECK-LABEL: load-pre-indexed-quadword
+; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+ %a = getelementptr inbounds %struct.quadword* %ptr, i64 0, i32 1, i32 0
+ %add = load fp128* %a, align 4
+ br label %bar
+bar:
+ %c = getelementptr inbounds %struct.quadword* %ptr, i64 0, i32 1
+ tail call void @bar_quadword(%s.quadword* %c, fp128 %add)
+ ret void
+}
+
+define void @store-pre-indexed-quadword(%struct.quadword* %ptr, fp128 %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-quadword
+; CHECK: str q{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+ %a = getelementptr inbounds %struct.quadword* %ptr, i64 0, i32 1, i32 0
+ store fp128 %val, fp128* %a, align 4
+ br label %bar
+bar:
+ %c = getelementptr inbounds %struct.quadword* %ptr, i64 0, i32 1
+ tail call void @bar_quadword(%s.quadword* %c, fp128 %val)
+ ret void
+}
+
+declare void @bar_float(%s.float*, float)
+
+define void @load-pre-indexed-float(%struct.float* %ptr) nounwind {
+; CHECK-LABEL: load-pre-indexed-float
+; CHECK: ldr s{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+ %a = getelementptr inbounds %struct.float* %ptr, i64 0, i32 1, i32 0
+ %add = load float* %a, align 4
+ br label %bar
+bar:
+ %c = getelementptr inbounds %struct.float* %ptr, i64 0, i32 1
+ tail call void @bar_float(%s.float* %c, float %add)
+ ret void
+}
+
+define void @store-pre-indexed-float(%struct.float* %ptr, float %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-float
+; CHECK: str s{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+ %a = getelementptr inbounds %struct.float* %ptr, i64 0, i32 1, i32 0
+ store float %val, float* %a, align 4
+ br label %bar
+bar:
+ %c = getelementptr inbounds %struct.float* %ptr, i64 0, i32 1
+ tail call void @bar_float(%s.float* %c, float %val)
+ ret void
+}
+
+declare void @bar_double(%s.double*, double)
+
+define void @load-pre-indexed-double(%struct.double* %ptr) nounwind {
+; CHECK-LABEL: load-pre-indexed-double
+; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+ %a = getelementptr inbounds %struct.double* %ptr, i64 0, i32 1, i32 0
+ %add = load double* %a, align 4
+ br label %bar
+bar:
+ %c = getelementptr inbounds %struct.double* %ptr, i64 0, i32 1
+ tail call void @bar_double(%s.double* %c, double %add)
+ ret void
+}
+
+define void @store-pre-indexed-double(%struct.double* %ptr, double %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-double
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+ %a = getelementptr inbounds %struct.double* %ptr, i64 0, i32 1, i32 0
+ store double %val, double* %a, align 4
+ br label %bar
+bar:
+ %c = getelementptr inbounds %struct.double* %ptr, i64 0, i32 1
+ tail call void @bar_double(%s.double* %c, double %val)
+ ret void
+}
+
+; Check the following transform:
+;
+; ldr X, [x20]
+; ...
+; add x20, x20, #32
+; ->
+; ldr X, [x20], #32
+;
+; with X being either w0, x0, s0, d0 or q0.
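+;
+; Roughly the loop shape (hypothetical C, for illustration only):
+;
+;   while (count) { use(p[-1]); use(p[0]); p += 4; count -= 4; }
+;
+; where the trailing pointer bump is folded into the load of p[0] as a
+; post-increment.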
+
+define void @load-post-indexed-word(i32* %array, i64 %count) nounwind {
+; CHECK-LABEL: load-post-indexed-word
+; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}], #16
+entry:
+ %gep1 = getelementptr i32* %array, i64 2
+ br label %body
+
+body:
+ %iv2 = phi i32* [ %gep3, %body ], [ %gep1, %entry ]
+ %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+ %gep2 = getelementptr i32* %iv2, i64 -1
+ %load = load i32* %gep2
+ call void @use-word(i32 %load)
+ %load2 = load i32* %iv2
+ call void @use-word(i32 %load2)
+ %iv.next = add i64 %iv, -4
+ %gep3 = getelementptr i32* %iv2, i64 4
+ %cond = icmp eq i64 %iv.next, 0
+ br i1 %cond, label %exit, label %body
+
+exit:
+ ret void
+}
+
+define void @load-post-indexed-doubleword(i64* %array, i64 %count) nounwind {
+; CHECK-LABEL: load-post-indexed-doubleword
+; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}], #32
+entry:
+ %gep1 = getelementptr i64* %array, i64 2
+ br label %body
+
+body:
+ %iv2 = phi i64* [ %gep3, %body ], [ %gep1, %entry ]
+ %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+ %gep2 = getelementptr i64* %iv2, i64 -1
+ %load = load i64* %gep2
+ call void @use-doubleword(i64 %load)
+ %load2 = load i64* %iv2
+ call void @use-doubleword(i64 %load2)
+ %iv.next = add i64 %iv, -4
+ %gep3 = getelementptr i64* %iv2, i64 4
+ %cond = icmp eq i64 %iv.next, 0
+ br i1 %cond, label %exit, label %body
+
+exit:
+ ret void
+}
+
+define void @load-post-indexed-quadword(<2 x i64>* %array, i64 %count) nounwind {
+; CHECK-LABEL: load-post-indexed-quadword
+; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+}}], #64
+entry:
+ %gep1 = getelementptr <2 x i64>* %array, i64 2
+ br label %body
+
+body:
+ %iv2 = phi <2 x i64>* [ %gep3, %body ], [ %gep1, %entry ]
+ %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+ %gep2 = getelementptr <2 x i64>* %iv2, i64 -1
+ %load = load <2 x i64>* %gep2
+ call void @use-quadword(<2 x i64> %load)
+ %load2 = load <2 x i64>* %iv2
+ call void @use-quadword(<2 x i64> %load2)
+ %iv.next = add i64 %iv, -4
+ %gep3 = getelementptr <2 x i64>* %iv2, i64 4
+ %cond = icmp eq i64 %iv.next, 0
+ br i1 %cond, label %exit, label %body
+
+exit:
+ ret void
+}
+
+define void @load-post-indexed-float(float* %array, i64 %count) nounwind {
+; CHECK-LABEL: load-post-indexed-float
+; CHECK: ldr s{{[0-9]+}}, [x{{[0-9]+}}], #16
+entry:
+ %gep1 = getelementptr float* %array, i64 2
+ br label %body
+
+body:
+ %iv2 = phi float* [ %gep3, %body ], [ %gep1, %entry ]
+ %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+ %gep2 = getelementptr float* %iv2, i64 -1
+ %load = load float* %gep2
+ call void @use-float(float %load)
+ %load2 = load float* %iv2
+ call void @use-float(float %load2)
+ %iv.next = add i64 %iv, -4
+ %gep3 = getelementptr float* %iv2, i64 4
+ %cond = icmp eq i64 %iv.next, 0
+ br i1 %cond, label %exit, label %body
+
+exit:
+ ret void
+}
+
+define void @load-post-indexed-double(double* %array, i64 %count) nounwind {
+; CHECK-LABEL: load-post-indexed-double
+; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+}}], #32
+entry:
+ %gep1 = getelementptr double* %array, i64 2
+ br label %body
+
+body:
+ %iv2 = phi double* [ %gep3, %body ], [ %gep1, %entry ]
+ %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+ %gep2 = getelementptr double* %iv2, i64 -1
+ %load = load double* %gep2
+ call void @use-double(double %load)
+ %load2 = load double* %iv2
+ call void @use-double(double %load2)
+ %iv.next = add i64 %iv, -4
+ %gep3 = getelementptr double* %iv2, i64 4
+ %cond = icmp eq i64 %iv.next, 0
+ br i1 %cond, label %exit, label %body
+
+exit:
+ ret void
+}
+
+declare void @use-word(i32)
+declare void @use-doubleword(i64)
+declare void @use-quadword(<2 x i64>)
+declare void @use-float(float)
+declare void @use-double(double)
diff --git a/test/CodeGen/AArch64/ldst-regoffset.ll b/test/CodeGen/AArch64/ldst-regoffset.ll
index db30fd9..e2fa08b 100644
--- a/test/CodeGen/AArch64/ldst-regoffset.ll
+++ b/test/CodeGen/AArch64/ldst-regoffset.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
@var_8bit = global i8 0
@@ -9,14 +9,14 @@
@var_float = global float 0.0
@var_double = global double 0.0
-define void @ldst_8bit(i8* %base, i32 %off32, i64 %off64) {
+define void @ldst_8bit(i8* %base, i32 %off32, i64 %off64) minsize {
; CHECK-LABEL: ldst_8bit:
%addr8_sxtw = getelementptr i8* %base, i32 %off32
%val8_sxtw = load volatile i8* %addr8_sxtw
%val32_signed = sext i8 %val8_sxtw to i32
store volatile i32 %val32_signed, i32* @var_32bit
-; CHECK: ldrsb {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+; CHECK: ldrsb {{w[0-9]+}}, [{{x[0-9]+}}, {{[wx][0-9]+}}, sxtw]
%addr_lsl = getelementptr i8* %base, i64 %off64
%val8_lsl = load volatile i8* %addr_lsl
@@ -31,20 +31,20 @@ define void @ldst_8bit(i8* %base, i32 %off32, i64 %off64) {
%val8_uxtw = load volatile i8* %addr_uxtw
%newval8 = add i8 %val8_uxtw, 1
store volatile i8 %newval8, i8* @var_8bit
-; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw]
ret void
}
-define void @ldst_16bit(i16* %base, i32 %off32, i64 %off64) {
+define void @ldst_16bit(i16* %base, i32 %off32, i64 %off64) minsize {
; CHECK-LABEL: ldst_16bit:
%addr8_sxtwN = getelementptr i16* %base, i32 %off32
%val8_sxtwN = load volatile i16* %addr8_sxtwN
%val32_signed = sext i16 %val8_sxtwN to i32
store volatile i32 %val32_signed, i32* @var_32bit
-; CHECK: ldrsh {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #1]
+; CHECK: ldrsh {{w[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #1]
%addr_lslN = getelementptr i16* %base, i64 %off64
%val8_lslN = load volatile i16* %addr_lslN
@@ -59,7 +59,7 @@ define void @ldst_16bit(i16* %base, i32 %off32, i64 %off64) {
%val8_uxtw = load volatile i16* %addr_uxtw
%newval8 = add i16 %val8_uxtw, 1
store volatile i16 %newval8, i16* @var_16bit
-; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw]
%base_sxtw = ptrtoint i16* %base to i64
%offset_sxtw = sext i32 %off32 to i64
@@ -68,7 +68,7 @@ define void @ldst_16bit(i16* %base, i32 %off32, i64 %off64) {
%val16_sxtw = load volatile i16* %addr_sxtw
%val64_signed = sext i16 %val16_sxtw to i64
store volatile i64 %val64_signed, i64* @var_64bit
-; CHECK: ldrsh {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+; CHECK: ldrsh {{x[0-9]+}}, [{{x[0-9]+}}, {{[wx][0-9]+}}, sxtw]
%base_lsl = ptrtoint i16* %base to i64
@@ -87,17 +87,17 @@ define void @ldst_16bit(i16* %base, i32 %off32, i64 %off64) {
%val32 = load volatile i32* @var_32bit
%val16_trunc32 = trunc i32 %val32 to i16
store volatile i16 %val16_trunc32, i16* %addr_uxtwN
-; CHECK: strh {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #1]
+; CHECK: strh {{w[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw #1]
ret void
}
-define void @ldst_32bit(i32* %base, i32 %off32, i64 %off64) {
+define void @ldst_32bit(i32* %base, i32 %off32, i64 %off64) minsize {
; CHECK-LABEL: ldst_32bit:
%addr_sxtwN = getelementptr i32* %base, i32 %off32
%val_sxtwN = load volatile i32* %addr_sxtwN
store volatile i32 %val_sxtwN, i32* @var_32bit
-; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #2]
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #2]
%addr_lslN = getelementptr i32* %base, i64 %off64
%val_lslN = load volatile i32* %addr_lslN
@@ -111,7 +111,7 @@ define void @ldst_32bit(i32* %base, i32 %off32, i64 %off64) {
%val_uxtw = load volatile i32* %addr_uxtw
%newval8 = add i32 %val_uxtw, 1
store volatile i32 %newval8, i32* @var_32bit
-; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw]
%base_sxtw = ptrtoint i32* %base to i64
@@ -121,7 +121,7 @@ define void @ldst_32bit(i32* %base, i32 %off32, i64 %off64) {
%val16_sxtw = load volatile i32* %addr_sxtw
%val64_signed = sext i32 %val16_sxtw to i64
store volatile i64 %val64_signed, i64* @var_64bit
-; CHECK: ldrsw {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+; CHECK: ldrsw {{x[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw]
%base_lsl = ptrtoint i32* %base to i64
@@ -139,17 +139,17 @@ define void @ldst_32bit(i32* %base, i32 %off32, i64 %off64) {
%addr_uxtwN = inttoptr i64 %addrint_uxtwN to i32*
%val32 = load volatile i32* @var_32bit
store volatile i32 %val32, i32* %addr_uxtwN
-; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #2]
+; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw #2]
ret void
}
-define void @ldst_64bit(i64* %base, i32 %off32, i64 %off64) {
+define void @ldst_64bit(i64* %base, i32 %off32, i64 %off64) minsize {
; CHECK-LABEL: ldst_64bit:
%addr_sxtwN = getelementptr i64* %base, i32 %off32
%val_sxtwN = load volatile i64* %addr_sxtwN
store volatile i64 %val_sxtwN, i64* @var_64bit
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #3]
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #3]
%addr_lslN = getelementptr i64* %base, i64 %off64
%val_lslN = load volatile i64* %addr_lslN
@@ -163,7 +163,7 @@ define void @ldst_64bit(i64* %base, i32 %off32, i64 %off64) {
%val8_uxtw = load volatile i64* %addr_uxtw
%newval8 = add i64 %val8_uxtw, 1
store volatile i64 %newval8, i64* @var_64bit
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw]
%base_sxtw = ptrtoint i64* %base to i64
%offset_sxtw = sext i32 %off32 to i64
@@ -171,7 +171,7 @@ define void @ldst_64bit(i64* %base, i32 %off32, i64 %off64) {
%addr_sxtw = inttoptr i64 %addrint_sxtw to i64*
%val64_sxtw = load volatile i64* %addr_sxtw
store volatile i64 %val64_sxtw, i64* @var_64bit
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw]
%base_lsl = ptrtoint i64* %base to i64
%addrint_lsl = add i64 %base_lsl, %off64
@@ -187,17 +187,17 @@ define void @ldst_64bit(i64* %base, i32 %off32, i64 %off64) {
%addr_uxtwN = inttoptr i64 %addrint_uxtwN to i64*
%val64 = load volatile i64* @var_64bit
store volatile i64 %val64, i64* %addr_uxtwN
-; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #3]
+; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw #3]
ret void
}
-define void @ldst_float(float* %base, i32 %off32, i64 %off64) {
+define void @ldst_float(float* %base, i32 %off32, i64 %off64) minsize {
; CHECK-LABEL: ldst_float:
%addr_sxtwN = getelementptr float* %base, i32 %off32
%val_sxtwN = load volatile float* %addr_sxtwN
store volatile float %val_sxtwN, float* @var_float
-; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #2]
+; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #2]
; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
%addr_lslN = getelementptr float* %base, i64 %off64
@@ -212,7 +212,7 @@ define void @ldst_float(float* %base, i32 %off32, i64 %off64) {
%addr_uxtw = inttoptr i64 %addrint1_uxtw to float*
%val_uxtw = load volatile float* %addr_uxtw
store volatile float %val_uxtw, float* @var_float
-; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw]
; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
%base_sxtw = ptrtoint float* %base to i64
@@ -221,7 +221,7 @@ define void @ldst_float(float* %base, i32 %off32, i64 %off64) {
%addr_sxtw = inttoptr i64 %addrint_sxtw to float*
%val64_sxtw = load volatile float* %addr_sxtw
store volatile float %val64_sxtw, float* @var_float
-; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw]
; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
%base_lsl = ptrtoint float* %base to i64
@@ -239,18 +239,18 @@ define void @ldst_float(float* %base, i32 %off32, i64 %off64) {
%addr_uxtwN = inttoptr i64 %addrint_uxtwN to float*
%val64 = load volatile float* @var_float
store volatile float %val64, float* %addr_uxtwN
-; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #2]
+; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw #2]
; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
ret void
}
-define void @ldst_double(double* %base, i32 %off32, i64 %off64) {
+define void @ldst_double(double* %base, i32 %off32, i64 %off64) minsize {
; CHECK-LABEL: ldst_double:
%addr_sxtwN = getelementptr double* %base, i32 %off32
%val_sxtwN = load volatile double* %addr_sxtwN
store volatile double %val_sxtwN, double* @var_double
-; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #3]
+; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #3]
; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
%addr_lslN = getelementptr double* %base, i64 %off64
@@ -265,7 +265,7 @@ define void @ldst_double(double* %base, i32 %off32, i64 %off64) {
%addr_uxtw = inttoptr i64 %addrint1_uxtw to double*
%val_uxtw = load volatile double* %addr_uxtw
store volatile double %val_uxtw, double* @var_double
-; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw]
; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
%base_sxtw = ptrtoint double* %base to i64
@@ -274,7 +274,7 @@ define void @ldst_double(double* %base, i32 %off32, i64 %off64) {
%addr_sxtw = inttoptr i64 %addrint_sxtw to double*
%val64_sxtw = load volatile double* %addr_sxtw
store volatile double %val64_sxtw, double* @var_double
-; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw]
; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
%base_lsl = ptrtoint double* %base to i64
@@ -292,26 +292,26 @@ define void @ldst_double(double* %base, i32 %off32, i64 %off64) {
%addr_uxtwN = inttoptr i64 %addrint_uxtwN to double*
%val64 = load volatile double* @var_double
store volatile double %val64, double* %addr_uxtwN
-; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #3]
+; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw #3]
; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
ret void
}
-define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) {
+define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) minsize {
; CHECK-LABEL: ldst_128bit:
%addr_sxtwN = getelementptr fp128* %base, i32 %off32
%val_sxtwN = load volatile fp128* %addr_sxtwN
store volatile fp128 %val_sxtwN, fp128* %base
-; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
-; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #4]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #4]
%addr_lslN = getelementptr fp128* %base, i64 %off64
%val_lslN = load volatile fp128* %addr_lslN
store volatile fp128 %val_lslN, fp128* %base
; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}, lsl #4]
-; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #4]
%addrint_uxtw = ptrtoint fp128* %base to i64
%offset_uxtw = zext i32 %off32 to i64
@@ -319,8 +319,8 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) {
%addr_uxtw = inttoptr i64 %addrint1_uxtw to fp128*
%val_uxtw = load volatile fp128* %addr_uxtw
store volatile fp128 %val_uxtw, fp128* %base
-; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
-; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #4]
%base_sxtw = ptrtoint fp128* %base to i64
%offset_sxtw = sext i32 %off32 to i64
@@ -328,8 +328,8 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) {
%addr_sxtw = inttoptr i64 %addrint_sxtw to fp128*
%val64_sxtw = load volatile fp128* %addr_sxtw
store volatile fp128 %val64_sxtw, fp128* %base
-; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
-; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #4]
%base_lsl = ptrtoint fp128* %base to i64
%addrint_lsl = add i64 %base_lsl, %off64
@@ -337,7 +337,7 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) {
%val64_lsl = load volatile fp128* %addr_lsl
store volatile fp128 %val64_lsl, fp128* %base
; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}]
-; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #4]
%base_uxtwN = ptrtoint fp128* %base to i64
%offset_uxtwN = zext i32 %off32 to i64
@@ -346,7 +346,7 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) {
%addr_uxtwN = inttoptr i64 %addrint_uxtwN to fp128*
%val64 = load volatile fp128* %base
store volatile fp128 %val64, fp128* %addr_uxtwN
-; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #4]
-; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
+; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, uxtw #4]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{[xw][0-9]+}}, sxtw #4]
ret void
}
diff --git a/test/CodeGen/AArch64/ldst-unscaledimm.ll b/test/CodeGen/AArch64/ldst-unscaledimm.ll
index bea5bb5..1de8443 100644
--- a/test/CodeGen/AArch64/ldst-unscaledimm.ll
+++ b/test/CodeGen/AArch64/ldst-unscaledimm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
@var_8bit = global i8 0
@@ -160,7 +160,7 @@ define void @ldst_32bit() {
%val64_unsigned = zext i32 %val32_zext to i64
store volatile i64 %val64_unsigned, i64* @var_64bit
; CHECK: ldur {{w[0-9]+}}, [{{x[0-9]+}}, #-256]
-; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_64bit]
+; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_64bit]
; Sign-extension to 64-bits
%addr32_8_sext = getelementptr i8* %addr_8bit, i64 -12
@@ -169,7 +169,7 @@ define void @ldst_32bit() {
%val64_signed = sext i32 %val32_sext to i64
store volatile i64 %val64_signed, i64* @var_64bit
; CHECK: ldursw {{x[0-9]+}}, [{{x[0-9]+}}, #-12]
-; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_64bit]
+; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_64bit]
; Truncation from 64-bits
%addr64_8_trunc = getelementptr i8* %addr_8bit, i64 255
diff --git a/test/CodeGen/AArch64/ldst-unsignedimm.ll b/test/CodeGen/AArch64/ldst-unsignedimm.ll
index 44c1586..e171d22 100644
--- a/test/CodeGen/AArch64/ldst-unsignedimm.ll
+++ b/test/CodeGen/AArch64/ldst-unsignedimm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
@var_8bit = global i8 0
@@ -20,25 +20,25 @@ define void @ldst_8bit() {
%val32_signed = sext i8 %val8_sext32 to i32
store volatile i32 %val32_signed, i32* @var_32bit
; CHECK: adrp {{x[0-9]+}}, var_8bit
-; CHECK: ldrsb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_8bit]
+; CHECK: ldrsb {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_8bit]
; match a zero-extending load volatile 8-bit -> 32-bit
%val8_zext32 = load volatile i8* @var_8bit
%val32_unsigned = zext i8 %val8_zext32 to i32
store volatile i32 %val32_unsigned, i32* @var_32bit
-; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_8bit]
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_8bit]
; match an any-extending load volatile 8-bit -> 32-bit
%val8_anyext = load volatile i8* @var_8bit
%newval8 = add i8 %val8_anyext, 1
store volatile i8 %newval8, i8* @var_8bit
-; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_8bit]
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_8bit]
; match a sign-extending load volatile 8-bit -> 64-bit
%val8_sext64 = load volatile i8* @var_8bit
%val64_signed = sext i8 %val8_sext64 to i64
store volatile i64 %val64_signed, i64* @var_64bit
-; CHECK: ldrsb {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_8bit]
+; CHECK: ldrsb {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_8bit]
; match a zero-extending load volatile 8-bit -> 64-bit.
; This uses the fact that ldrb w0, [x0] will zero out the high 32-bits
@@ -46,19 +46,19 @@ define void @ldst_8bit() {
%val8_zext64 = load volatile i8* @var_8bit
%val64_unsigned = zext i8 %val8_zext64 to i64
store volatile i64 %val64_unsigned, i64* @var_64bit
-; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_8bit]
+; CHECK: ldrb {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_8bit]
; truncating store volatile 32-bits to 8-bits
%val32 = load volatile i32* @var_32bit
%val8_trunc32 = trunc i32 %val32 to i8
store volatile i8 %val8_trunc32, i8* @var_8bit
-; CHECK: strb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_8bit]
+; CHECK: strb {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_8bit]
; truncating store volatile 64-bits to 8-bits
%val64 = load volatile i64* @var_64bit
%val8_trunc64 = trunc i64 %val64 to i8
store volatile i8 %val8_trunc64, i8* @var_8bit
-; CHECK: strb {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_8bit]
+; CHECK: strb {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_8bit]
ret void
}
@@ -74,25 +74,25 @@ define void @ldst_16bit() {
%val32_signed = sext i16 %val16_sext32 to i32
store volatile i32 %val32_signed, i32* @var_32bit
; CHECK: adrp {{x[0-9]+}}, var_16bit
-; CHECK: ldrsh {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_16bit]
+; CHECK: ldrsh {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_16bit]
; match a zero-extending load volatile 16-bit -> 32-bit
%val16_zext32 = load volatile i16* @var_16bit
%val32_unsigned = zext i16 %val16_zext32 to i32
store volatile i32 %val32_unsigned, i32* @var_32bit
-; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_16bit]
+; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_16bit]
; match an any-extending load volatile 16-bit -> 32-bit
%val16_anyext = load volatile i16* @var_16bit
%newval16 = add i16 %val16_anyext, 1
store volatile i16 %newval16, i16* @var_16bit
-; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_16bit]
+; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_16bit]
; match a sign-extending load volatile 16-bit -> 64-bit
%val16_sext64 = load volatile i16* @var_16bit
%val64_signed = sext i16 %val16_sext64 to i64
store volatile i64 %val64_signed, i64* @var_64bit
-; CHECK: ldrsh {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_16bit]
+; CHECK: ldrsh {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_16bit]
; match a zero-extending load volatile 16-bit -> 64-bit.
; This uses the fact that ldrb w0, [x0] will zero out the high 32-bits
@@ -100,19 +100,19 @@ define void @ldst_16bit() {
%val16_zext64 = load volatile i16* @var_16bit
%val64_unsigned = zext i16 %val16_zext64 to i64
store volatile i64 %val64_unsigned, i64* @var_64bit
-; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_16bit]
+; CHECK: ldrh {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_16bit]
; truncating store volatile 32-bits to 16-bits
%val32 = load volatile i32* @var_32bit
%val16_trunc32 = trunc i32 %val32 to i16
store volatile i16 %val16_trunc32, i16* @var_16bit
-; CHECK: strh {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_16bit]
+; CHECK: strh {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_16bit]
; truncating store volatile 64-bits to 16-bits
%val64 = load volatile i64* @var_64bit
%val16_trunc64 = trunc i64 %val64 to i16
store volatile i16 %val16_trunc64, i16* @var_16bit
-; CHECK: strh {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_16bit]
+; CHECK: strh {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_16bit]
ret void
}
@@ -124,29 +124,29 @@ define void @ldst_32bit() {
%val32_noext = load volatile i32* @var_32bit
store volatile i32 %val32_noext, i32* @var_32bit
; CHECK: adrp {{x[0-9]+}}, var_32bit
-; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_32bit]
-; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_32bit]
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_32bit]
+; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_32bit]
; Zero-extension to 64-bits
%val32_zext = load volatile i32* @var_32bit
%val64_unsigned = zext i32 %val32_zext to i64
store volatile i64 %val64_unsigned, i64* @var_64bit
-; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_32bit]
-; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_64bit]
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_32bit]
+; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_64bit]
; Sign-extension to 64-bits
%val32_sext = load volatile i32* @var_32bit
%val64_signed = sext i32 %val32_sext to i64
store volatile i64 %val64_signed, i64* @var_64bit
-; CHECK: ldrsw {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_32bit]
-; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_64bit]
+; CHECK: ldrsw {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_32bit]
+; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_64bit]
; Truncation from 64-bits
%val64_trunc = load volatile i64* @var_64bit
%val32_trunc = trunc i64 %val64_trunc to i32
store volatile i32 %val32_trunc, i32* @var_32bit
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_64bit]
-; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_32bit]
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_64bit]
+; CHECK: str {{w[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_32bit]
ret void
}
@@ -165,7 +165,7 @@ define void @ldst_complex_offsets() {
; CHECK: ldst_complex_offsets
%arr8_addr = load volatile i8** @arr8
; CHECK: adrp {{x[0-9]+}}, arr8
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:arr8]
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:arr8]
%arr8_sub1_addr = getelementptr i8* %arr8_addr, i64 1
%arr8_sub1 = load volatile i8* %arr8_sub1_addr
@@ -180,7 +180,7 @@ define void @ldst_complex_offsets() {
%arr16_addr = load volatile i16** @arr16
; CHECK: adrp {{x[0-9]+}}, arr16
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:arr16]
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:arr16]
%arr16_sub1_addr = getelementptr i16* %arr16_addr, i64 1
%arr16_sub1 = load volatile i16* %arr16_sub1_addr
@@ -195,7 +195,7 @@ define void @ldst_complex_offsets() {
%arr32_addr = load volatile i32** @arr32
; CHECK: adrp {{x[0-9]+}}, arr32
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:arr32]
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:arr32]
%arr32_sub1_addr = getelementptr i32* %arr32_addr, i64 1
%arr32_sub1 = load volatile i32* %arr32_sub1_addr
@@ -210,7 +210,7 @@ define void @ldst_complex_offsets() {
%arr64_addr = load volatile i64** @arr64
; CHECK: adrp {{x[0-9]+}}, arr64
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #:lo12:arr64]
+; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:arr64]
%arr64_sub1_addr = getelementptr i64* %arr64_addr, i64 1
%arr64_sub1 = load volatile i64* %arr64_sub1_addr
@@ -230,11 +230,11 @@ define void @ldst_float() {
%valfp = load volatile float* @var_float
; CHECK: adrp {{x[0-9]+}}, var_float
-; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_float]
+; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_float]
; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
store volatile float %valfp, float* @var_float
-; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_float]
+; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_float]
; CHECK-NOFP-NOT: str {{s[0-9]+}},
ret void
@@ -245,11 +245,11 @@ define void @ldst_double() {
%valfp = load volatile double* @var_double
; CHECK: adrp {{x[0-9]+}}, var_double
-; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_double]
+; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_double]
; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
store volatile double %valfp, double* @var_double
-; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_double]
+; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:var_double]
; CHECK-NOFP-NOT: str {{d[0-9]+}},
ret void
diff --git a/test/CodeGen/AArch64/lit.local.cfg b/test/CodeGen/AArch64/lit.local.cfg
index 9a66a00..77493d8 100644
--- a/test/CodeGen/AArch64/lit.local.cfg
+++ b/test/CodeGen/AArch64/lit.local.cfg
@@ -1,4 +1,11 @@
+import re
+
+config.suffixes = ['.ll']
+
targets = set(config.root.targets_to_build.split())
if not 'AArch64' in targets:
config.unsupported = True
+# For now we don't test arm64-win32.
+if re.search(r'cygwin|mingw32|win32', config.target_triple):
+ config.unsupported = True
diff --git a/test/CodeGen/AArch64/literal_pools.ll b/test/CodeGen/AArch64/literal_pools.ll
deleted file mode 100644
index fc33aee..0000000
--- a/test/CodeGen/AArch64/literal_pools.ll
+++ /dev/null
@@ -1,103 +0,0 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -code-model=large | FileCheck --check-prefix=CHECK-LARGE %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -code-model=large -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP-LARGE %s
-
-@var32 = global i32 0
-@var64 = global i64 0
-
-define void @foo() {
-; CHECK-LABEL: foo:
- %val32 = load i32* @var32
- %val64 = load i64* @var64
-
- %val32_lit32 = and i32 %val32, 123456785
- store volatile i32 %val32_lit32, i32* @var32
-; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI0_[0-9]+]]
-; CHECK: ldr {{w[0-9]+}}, [x[[LITBASE]], #:lo12:[[CURLIT]]]
-
-; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI0_[0-9]+]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]]
-; CHECK-LARGE: ldr {{w[0-9]+}}, [x[[LITADDR]]]
-
- %val64_lit32 = and i64 %val64, 305402420
- store volatile i64 %val64_lit32, i64* @var64
-; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI0_[0-9]+]]
-; CHECK: ldr {{w[0-9]+}}, [x[[LITBASE]], #:lo12:[[CURLIT]]]
-
-; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI0_[0-9]+]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]]
-; CHECK-LARGE: ldr {{w[0-9]+}}, [x[[LITADDR]]]
-
- %val64_lit32signed = and i64 %val64, -12345678
- store volatile i64 %val64_lit32signed, i64* @var64
-; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI0_[0-9]+]]
-; CHECK: ldrsw {{x[0-9]+}}, [x[[LITBASE]], #:lo12:[[CURLIT]]]
-
-; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI0_[0-9]+]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]]
-; CHECK-LARGE: ldrsw {{x[0-9]+}}, [x[[LITADDR]]]
-
- %val64_lit64 = and i64 %val64, 1234567898765432
- store volatile i64 %val64_lit64, i64* @var64
-; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI0_[0-9]+]]
-; CHECK: ldr {{x[0-9]+}}, [x[[LITBASE]], #:lo12:[[CURLIT]]]
-
-; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI0_[0-9]+]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]]
-; CHECK-LARGE: ldr {{x[0-9]+}}, [x[[LITADDR]]]
-
- ret void
-}
-
-@varfloat = global float 0.0
-@vardouble = global double 0.0
-
-define void @floating_lits() {
-; CHECK-LABEL: floating_lits:
-
- %floatval = load float* @varfloat
- %newfloat = fadd float %floatval, 128.0
-; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI1_[0-9]+]]
-; CHECK: ldr [[LIT128:s[0-9]+]], [x[[LITBASE]], #:lo12:[[CURLIT]]]
-; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
-
-; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI1_[0-9]+]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]]
-; CHECK-LARGE: ldr {{s[0-9]+}}, [x[[LITADDR]]]
-; CHECK-LARGE: fadd
-; CHECK-NOFP-LARGE-NOT: ldr {{s[0-9]+}},
-; CHECK-NOFP-LARGE-NOT: fadd
-
- store float %newfloat, float* @varfloat
-
- %doubleval = load double* @vardouble
- %newdouble = fadd double %doubleval, 129.0
-; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI1_[0-9]+]]
-; CHECK: ldr [[LIT129:d[0-9]+]], [x[[LITBASE]], #:lo12:[[CURLIT]]]
-; CHECK: fadd {{s[0-9]+}}, {{s[0-9]+}}, [[LIT128]]
-; CHECK: fadd {{d[0-9]+}}, {{d[0-9]+}}, [[LIT129]]
-; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
-; CHECK-NOFP-NOT: fadd
-
-; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI1_[0-9]+]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]]
-; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]]
-; CHECK-LARGE: ldr {{d[0-9]+}}, [x[[LITADDR]]]
-; CHECK-NOFP-LARGE-NOT: ldr {{d[0-9]+}},
-
- store double %newdouble, double* @vardouble
-
- ret void
-}
diff --git a/test/CodeGen/AArch64/literal_pools_float.ll b/test/CodeGen/AArch64/literal_pools_float.ll
new file mode 100644
index 0000000..e53b8b6
--- /dev/null
+++ b/test/CodeGen/AArch64/literal_pools_float.ll
@@ -0,0 +1,46 @@
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -mcpu=cyclone | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -code-model=large -mcpu=cyclone | FileCheck --check-prefix=CHECK-LARGE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -code-model=large -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP-LARGE %s
+
+@varfloat = global float 0.0
+@vardouble = global double 0.0
+
+define void @floating_lits() {
+; CHECK-LABEL: floating_lits:
+
+ %floatval = load float* @varfloat
+ %newfloat = fadd float %floatval, 128.0
+; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI[0-9]+_[0-9]+]]
+; CHECK: ldr [[LIT128:s[0-9]+]], [x[[LITBASE]], {{#?}}:lo12:[[CURLIT]]]
+; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
+
+; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI[0-9]+_[0-9]+]]
+; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]]
+; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]]
+; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]]
+; CHECK-LARGE: ldr {{s[0-9]+}}, [x[[LITADDR]]]
+; CHECK-LARGE: fadd
+; CHECK-NOFP-LARGE-NOT: ldr {{s[0-9]+}},
+; CHECK-NOFP-LARGE-NOT: fadd
+
+ store float %newfloat, float* @varfloat
+
+ %doubleval = load double* @vardouble
+ %newdouble = fadd double %doubleval, 129.0
+; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI[0-9]+_[0-9]+]]
+; CHECK: ldr [[LIT129:d[0-9]+]], [x[[LITBASE]], {{#?}}:lo12:[[CURLIT]]]
+; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
+; CHECK-NOFP-NOT: fadd
+
+; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI[0-9]+_[0-9]+]]
+; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]]
+; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]]
+; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]]
+; CHECK-LARGE: ldr {{d[0-9]+}}, [x[[LITADDR]]]
+; CHECK-NOFP-LARGE-NOT: ldr {{d[0-9]+}},
+
+ store double %newdouble, double* @vardouble
+
+ ret void
+}
diff --git a/test/CodeGen/AArch64/local_vars.ll b/test/CodeGen/AArch64/local_vars.ll
index b5cef85..2f5b9f2 100644
--- a/test/CodeGen/AArch64/local_vars.ll
+++ b/test/CodeGen/AArch64/local_vars.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0 | FileCheck %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0 -disable-fp-elim | FileCheck -check-prefix CHECK-WITHFP %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -disable-fp-elim | FileCheck -check-prefix CHECK-WITHFP-ARM64 %s
; Make sure a reasonably sane prologue and epilogue are
; generated. This test is not robust in the face of a frame-handling
@@ -16,7 +16,7 @@
declare void @foo()
define void @trivial_func() nounwind {
-; CHECK: trivial_func: // @trivial_func
+; CHECK-LABEL: trivial_func: // @trivial_func
; CHECK-NEXT: // BB#0
; CHECK-NEXT: ret
@@ -24,11 +24,14 @@ define void @trivial_func() nounwind {
}
define void @trivial_fp_func() {
-; CHECK-WITHFP-LABEL: trivial_fp_func:
+; CHECK-WITHFP-AARCH64-LABEL: trivial_fp_func:
+; CHECK-WITHFP-AARCH64: sub sp, sp, #16
+; CHECK-WITHFP-AARCH64: stp x29, x30, [sp]
+; CHECK-WITHFP-AARCH64-NEXT: mov x29, sp
-; CHECK-WITHFP: sub sp, sp, #16
-; CHECK-WITHFP: stp x29, x30, [sp]
-; CHECK-WITHFP-NEXT: mov x29, sp
+; CHECK-WITHFP-ARM64-LABEL: trivial_fp_func:
+; CHECK-WITHFP-ARM64: stp x29, x30, [sp, #-16]!
+; CHECK-WITHFP-ARM64-NEXT: mov x29, sp
; Don't really care, but it would be a Bad Thing if this came after the epilogue.
; CHECK: bl foo
@@ -48,10 +51,10 @@ define void @stack_local() {
%val = load i64* @var
store i64 %val, i64* %local_var
-; CHECK: str {{x[0-9]+}}, [sp, #{{[0-9]+}}]
+; CHECK-DAG: str {{x[0-9]+}}, [sp, #{{[0-9]+}}]
store i64* %local_var, i64** @local_addr
-; CHECK: add {{x[0-9]+}}, sp, #{{[0-9]+}}
+; CHECK-DAG: add {{x[0-9]+}}, sp, #{{[0-9]+}}
ret void
}
diff --git a/test/CodeGen/AArch64/logical-imm.ll b/test/CodeGen/AArch64/logical-imm.ll
index e04bb51..a5e4a99 100644
--- a/test/CodeGen/AArch64/logical-imm.ll
+++ b/test/CodeGen/AArch64/logical-imm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
@var32 = global i32 0
@var64 = global i64 0
diff --git a/test/CodeGen/AArch64/logical_shifted_reg.ll b/test/CodeGen/AArch64/logical_shifted_reg.ll
index a08ba20..b249d72 100644
--- a/test/CodeGen/AArch64/logical_shifted_reg.ll
+++ b/test/CodeGen/AArch64/logical_shifted_reg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0 | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
@var1_32 = global i32 0
@var2_32 = global i32 0
@@ -6,7 +6,7 @@
@var1_64 = global i64 0
@var2_64 = global i64 0
-define void @logical_32bit() {
+define void @logical_32bit() minsize {
; CHECK-LABEL: logical_32bit:
%val1 = load i32* @var1_32
%val2 = load i32* @var2_32
@@ -96,7 +96,7 @@ define void @logical_32bit() {
ret void
}
-define void @logical_64bit() {
+define void @logical_64bit() minsize {
; CHECK-LABEL: logical_64bit:
%val1 = load i64* @var1_64
%val2 = load i64* @var2_64
diff --git a/test/CodeGen/AArch64/mature-mc-support.ll b/test/CodeGen/AArch64/mature-mc-support.ll
index 06e3cc7..276c54d 100644
--- a/test/CodeGen/AArch64/mature-mc-support.ll
+++ b/test/CodeGen/AArch64/mature-mc-support.ll
@@ -1,11 +1,11 @@
; Test that inline assembly is parsed by the MC layer when MC support is mature
; (even when the output is assembly).
-; RUN: not llc -mtriple=aarch64-pc-linux < %s > /dev/null 2> %t1
-; RUN: FileCheck %s < %t1
+; RUN: not llc -mtriple=aarch64-pc-linux < %s > /dev/null 2> %t3
+; RUN: FileCheck %s < %t3
-; RUN: not llc -mtriple=aarch64-pc-linux -filetype=obj < %s > /dev/null 2> %t2
-; RUN: FileCheck %s < %t2
+; RUN: not llc -mtriple=aarch64-pc-linux -filetype=obj < %s > /dev/null 2> %t4
+; RUN: FileCheck %s < %t4
module asm " .this_directive_is_very_unlikely_to_exist"
diff --git a/test/CodeGen/AArch64/movw-consts.ll b/test/CodeGen/AArch64/movw-consts.ll
index 38e37db..93c1812 100644
--- a/test/CodeGen/AArch64/movw-consts.ll
+++ b/test/CodeGen/AArch64/movw-consts.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -O0 < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s --check-prefix=CHECK
define i64 @test0() {
; CHECK-LABEL: test0:
@@ -9,43 +9,43 @@ define i64 @test0() {
define i64 @test1() {
; CHECK-LABEL: test1:
-; CHECK: movz x0, #1
+; CHECK: orr w0, wzr, #0x1
ret i64 1
}
define i64 @test2() {
; CHECK-LABEL: test2:
-; CHECK: movz x0, #65535
+; CHECK: orr w0, wzr, #0xffff
ret i64 65535
}
define i64 @test3() {
; CHECK-LABEL: test3:
-; CHECK: movz x0, #1, lsl #16
+; CHECK: orr w0, wzr, #0x10000
ret i64 65536
}
define i64 @test4() {
; CHECK-LABEL: test4:
-; CHECK: movz x0, #65535, lsl #16
+; CHECK: orr w0, wzr, #0xffff0000
ret i64 4294901760
}
define i64 @test5() {
; CHECK-LABEL: test5:
-; CHECK: movz x0, #1, lsl #32
+; CHECK: orr x0, xzr, #0x100000000
ret i64 4294967296
}
define i64 @test6() {
; CHECK-LABEL: test6:
-; CHECK: movz x0, #65535, lsl #32
+; CHECK: orr x0, xzr, #0xffff00000000
ret i64 281470681743360
}
define i64 @test7() {
; CHECK-LABEL: test7:
-; CHECK: movz x0, #1, lsl #48
+; CHECK: orr x0, xzr, #0x1000000000000
ret i64 281474976710656
}
@@ -53,7 +53,7 @@ define i64 @test7() {
; couldn't. Useful even for i64
define i64 @test8() {
; CHECK-LABEL: test8:
-; CHECK: movn w0, #60875
+; CHECK: movn w0, #{{60875|0xedcb}}
ret i64 4294906420
}
@@ -65,7 +65,7 @@ define i64 @test9() {
define i64 @test10() {
; CHECK-LABEL: test10:
-; CHECK: movn x0, #60875, lsl #16
+; CHECK: movn x0, #{{60875|0xedcb}}, lsl #16
ret i64 18446744069720047615
}
@@ -75,35 +75,35 @@ define i64 @test10() {
define void @test11() {
; CHECK-LABEL: test11:
-; CHECK: mov {{w[0-9]+}}, wzr
+; CHECK: str wzr
store i32 0, i32* @var32
ret void
}
define void @test12() {
; CHECK-LABEL: test12:
-; CHECK: movz {{w[0-9]+}}, #1
+; CHECK: orr {{w[0-9]+}}, wzr, #0x1
store i32 1, i32* @var32
ret void
}
define void @test13() {
; CHECK-LABEL: test13:
-; CHECK: movz {{w[0-9]+}}, #65535
+; CHECK: orr {{w[0-9]+}}, wzr, #0xffff
store i32 65535, i32* @var32
ret void
}
define void @test14() {
; CHECK-LABEL: test14:
-; CHECK: movz {{w[0-9]+}}, #1, lsl #16
+; CHECK: orr {{w[0-9]+}}, wzr, #0x10000
store i32 65536, i32* @var32
ret void
}
define void @test15() {
; CHECK-LABEL: test15:
-; CHECK: movz {{w[0-9]+}}, #65535, lsl #16
+; CHECK: orr {{w[0-9]+}}, wzr, #0xffff0000
store i32 4294901760, i32* @var32
ret void
}
@@ -119,6 +119,6 @@ define i64 @test17() {
; CHECK-LABEL: test17:
; Mustn't MOVN w0 here.
-; CHECK: movn x0, #2
+; CHECK: orr x0, xzr, #0xfffffffffffffffd
ret i64 -3
}
diff --git a/test/CodeGen/AArch64/movw-shift-encoding.ll b/test/CodeGen/AArch64/movw-shift-encoding.ll
index ec133bd..178fccc 100644
--- a/test/CodeGen/AArch64/movw-shift-encoding.ll
+++ b/test/CodeGen/AArch64/movw-shift-encoding.ll
@@ -7,8 +7,9 @@
define i32* @get_var() {
ret i32* @var
-; CHECK: movz x0, #:abs_g3:var // encoding: [A,A,0xe0'A',0xd2'A']
-; CHECK: movk x0, #:abs_g2_nc:var // encoding: [A,A,0xc0'A',0xf2'A']
-; CHECK: movk x0, #:abs_g1_nc:var // encoding: [A,A,0xa0'A',0xf2'A']
-; CHECK: movk x0, #:abs_g0_nc:var // encoding: [A,A,0x80'A',0xf2'A']
+
+; CHECK: movz x0, #:abs_g3:var // encoding: [0bAAA00000,A,0b111AAAAA,0xd2]
+; CHECK: movk x0, #:abs_g2_nc:var // encoding: [0bAAA00000,A,0b110AAAAA,0xf2]
+; CHECK: movk x0, #:abs_g1_nc:var // encoding: [0bAAA00000,A,0b101AAAAA,0xf2]
+; CHECK: movk x0, #:abs_g0_nc:var // encoding: [0bAAA00000,A,0b100AAAAA,0xf2]
}
diff --git a/test/CodeGen/AArch64/mul-lohi.ll b/test/CodeGen/AArch64/mul-lohi.ll
index f58c598..0689fbd 100644
--- a/test/CodeGen/AArch64/mul-lohi.ll
+++ b/test/CodeGen/AArch64/mul-lohi.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s
-; RUN: llc -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck --check-prefix=CHECK-BE %s
+; RUN: llc -mtriple=arm64-apple-ios7.0 %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm64_be-linux-gnu %s -o - | FileCheck --check-prefix=CHECK-BE %s
define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
; CHECK-LABEL: test_128bitmul:
diff --git a/test/CodeGen/AArch64/neon-across.ll b/test/CodeGen/AArch64/neon-across.ll
deleted file mode 100644
index 6d30c95..0000000
--- a/test/CodeGen/AArch64/neon-across.ll
+++ /dev/null
@@ -1,472 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-declare float @llvm.aarch64.neon.vminnmv(<4 x float>)
-
-declare float @llvm.aarch64.neon.vmaxnmv(<4 x float>)
-
-declare float @llvm.aarch64.neon.vminv(<4 x float>)
-
-declare float @llvm.aarch64.neon.vmaxv(<4 x float>)
-
-declare <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v4i32(<4 x i32>)
-
-declare <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v8i16(<8 x i16>)
-
-declare <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v16i8(<16 x i8>)
-
-declare <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v4i16(<4 x i16>)
-
-declare <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v8i8(<8 x i8>)
-
-declare <1 x i32> @llvm.aarch64.neon.uminv.v1i32.v4i32(<4 x i32>)
-
-declare <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v8i16(<8 x i16>)
-
-declare <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v16i8(<16 x i8>)
-
-declare <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v4i32(<4 x i32>)
-
-declare <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v8i16(<8 x i16>)
-
-declare <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v16i8(<16 x i8>)
-
-declare <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v4i16(<4 x i16>)
-
-declare <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v8i8(<8 x i8>)
-
-declare <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v4i16(<4 x i16>)
-
-declare <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v8i8(<8 x i8>)
-
-declare <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v4i32(<4 x i32>)
-
-declare <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v8i16(<8 x i16>)
-
-declare <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v16i8(<16 x i8>)
-
-declare <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v4i32(<4 x i32>)
-
-declare <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v8i16(<8 x i16>)
-
-declare <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v16i8(<16 x i8>)
-
-declare <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v4i16(<4 x i16>)
-
-declare <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v8i8(<8 x i8>)
-
-declare <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v4i16(<4 x i16>)
-
-declare <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v8i8(<8 x i8>)
-
-declare <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v4i32(<4 x i32>)
-
-declare <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v8i16(<8 x i16>)
-
-declare <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v16i8(<16 x i8>)
-
-declare <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v4i32(<4 x i32>)
-
-declare <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v8i16(<8 x i16>)
-
-declare <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v16i8(<16 x i8>)
-
-declare <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v4i16(<4 x i16>)
-
-declare <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v8i8(<8 x i8>)
-
-declare <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v4i16(<4 x i16>)
-
-declare <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v8i8(<8 x i8>)
-
-define i16 @test_vaddlv_s8(<8 x i8> %a) {
-; CHECK: test_vaddlv_s8:
-; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
- %saddlv.i = tail call <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v8i8(<8 x i8> %a)
- %0 = extractelement <1 x i16> %saddlv.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vaddlv_s16(<4 x i16> %a) {
-; CHECK: test_vaddlv_s16:
-; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
- %saddlv.i = tail call <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v4i16(<4 x i16> %a)
- %0 = extractelement <1 x i32> %saddlv.i, i32 0
- ret i32 %0
-}
-
-define i16 @test_vaddlv_u8(<8 x i8> %a) {
-; CHECK: test_vaddlv_u8:
-; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
- %uaddlv.i = tail call <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v8i8(<8 x i8> %a)
- %0 = extractelement <1 x i16> %uaddlv.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vaddlv_u16(<4 x i16> %a) {
-; CHECK: test_vaddlv_u16:
-; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
- %uaddlv.i = tail call <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v4i16(<4 x i16> %a)
- %0 = extractelement <1 x i32> %uaddlv.i, i32 0
- ret i32 %0
-}
-
-define i16 @test_vaddlvq_s8(<16 x i8> %a) {
-; CHECK: test_vaddlvq_s8:
-; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
- %saddlv.i = tail call <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v16i8(<16 x i8> %a)
- %0 = extractelement <1 x i16> %saddlv.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vaddlvq_s16(<8 x i16> %a) {
-; CHECK: test_vaddlvq_s16:
-; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
- %saddlv.i = tail call <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v8i16(<8 x i16> %a)
- %0 = extractelement <1 x i32> %saddlv.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vaddlvq_s32(<4 x i32> %a) {
-; CHECK: test_vaddlvq_s32:
-; CHECK: saddlv d{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %saddlv.i = tail call <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v4i32(<4 x i32> %a)
- %0 = extractelement <1 x i64> %saddlv.i, i32 0
- ret i64 %0
-}
-
-define i16 @test_vaddlvq_u8(<16 x i8> %a) {
-; CHECK: test_vaddlvq_u8:
-; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
- %uaddlv.i = tail call <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v16i8(<16 x i8> %a)
- %0 = extractelement <1 x i16> %uaddlv.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vaddlvq_u16(<8 x i16> %a) {
-; CHECK: test_vaddlvq_u16:
-; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
- %uaddlv.i = tail call <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v8i16(<8 x i16> %a)
- %0 = extractelement <1 x i32> %uaddlv.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vaddlvq_u32(<4 x i32> %a) {
-; CHECK: test_vaddlvq_u32:
-; CHECK: uaddlv d{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %uaddlv.i = tail call <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v4i32(<4 x i32> %a)
- %0 = extractelement <1 x i64> %uaddlv.i, i32 0
- ret i64 %0
-}
-
-define i8 @test_vmaxv_s8(<8 x i8> %a) {
-; CHECK: test_vmaxv_s8:
-; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
- %smaxv.i = tail call <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v8i8(<8 x i8> %a)
- %0 = extractelement <1 x i8> %smaxv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vmaxv_s16(<4 x i16> %a) {
-; CHECK: test_vmaxv_s16:
-; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
- %smaxv.i = tail call <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v4i16(<4 x i16> %a)
- %0 = extractelement <1 x i16> %smaxv.i, i32 0
- ret i16 %0
-}
-
-define i8 @test_vmaxv_u8(<8 x i8> %a) {
-; CHECK: test_vmaxv_u8:
-; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
- %umaxv.i = tail call <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v8i8(<8 x i8> %a)
- %0 = extractelement <1 x i8> %umaxv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vmaxv_u16(<4 x i16> %a) {
-; CHECK: test_vmaxv_u16:
-; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
- %umaxv.i = tail call <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v4i16(<4 x i16> %a)
- %0 = extractelement <1 x i16> %umaxv.i, i32 0
- ret i16 %0
-}
-
-define i8 @test_vmaxvq_s8(<16 x i8> %a) {
-; CHECK: test_vmaxvq_s8:
-; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
- %smaxv.i = tail call <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v16i8(<16 x i8> %a)
- %0 = extractelement <1 x i8> %smaxv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vmaxvq_s16(<8 x i16> %a) {
-; CHECK: test_vmaxvq_s16:
-; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
- %smaxv.i = tail call <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v8i16(<8 x i16> %a)
- %0 = extractelement <1 x i16> %smaxv.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vmaxvq_s32(<4 x i32> %a) {
-; CHECK: test_vmaxvq_s32:
-; CHECK: smaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %smaxv.i = tail call <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v4i32(<4 x i32> %a)
- %0 = extractelement <1 x i32> %smaxv.i, i32 0
- ret i32 %0
-}
-
-define i8 @test_vmaxvq_u8(<16 x i8> %a) {
-; CHECK: test_vmaxvq_u8:
-; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
- %umaxv.i = tail call <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v16i8(<16 x i8> %a)
- %0 = extractelement <1 x i8> %umaxv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vmaxvq_u16(<8 x i16> %a) {
-; CHECK: test_vmaxvq_u16:
-; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
- %umaxv.i = tail call <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v8i16(<8 x i16> %a)
- %0 = extractelement <1 x i16> %umaxv.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vmaxvq_u32(<4 x i32> %a) {
-; CHECK: test_vmaxvq_u32:
-; CHECK: umaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %umaxv.i = tail call <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v4i32(<4 x i32> %a)
- %0 = extractelement <1 x i32> %umaxv.i, i32 0
- ret i32 %0
-}
-
-define i8 @test_vminv_s8(<8 x i8> %a) {
-; CHECK: test_vminv_s8:
-; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
- %sminv.i = tail call <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v8i8(<8 x i8> %a)
- %0 = extractelement <1 x i8> %sminv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vminv_s16(<4 x i16> %a) {
-; CHECK: test_vminv_s16:
-; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
- %sminv.i = tail call <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v4i16(<4 x i16> %a)
- %0 = extractelement <1 x i16> %sminv.i, i32 0
- ret i16 %0
-}
-
-define i8 @test_vminv_u8(<8 x i8> %a) {
-; CHECK: test_vminv_u8:
-; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
- %uminv.i = tail call <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v8i8(<8 x i8> %a)
- %0 = extractelement <1 x i8> %uminv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vminv_u16(<4 x i16> %a) {
-; CHECK: test_vminv_u16:
-; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
- %uminv.i = tail call <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v4i16(<4 x i16> %a)
- %0 = extractelement <1 x i16> %uminv.i, i32 0
- ret i16 %0
-}
-
-define i8 @test_vminvq_s8(<16 x i8> %a) {
-; CHECK: test_vminvq_s8:
-; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
- %sminv.i = tail call <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v16i8(<16 x i8> %a)
- %0 = extractelement <1 x i8> %sminv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vminvq_s16(<8 x i16> %a) {
-; CHECK: test_vminvq_s16:
-; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
- %sminv.i = tail call <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v8i16(<8 x i16> %a)
- %0 = extractelement <1 x i16> %sminv.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vminvq_s32(<4 x i32> %a) {
-; CHECK: test_vminvq_s32:
-; CHECK: sminv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %sminv.i = tail call <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v4i32(<4 x i32> %a)
- %0 = extractelement <1 x i32> %sminv.i, i32 0
- ret i32 %0
-}
-
-define i8 @test_vminvq_u8(<16 x i8> %a) {
-; CHECK: test_vminvq_u8:
-; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
- %uminv.i = tail call <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v16i8(<16 x i8> %a)
- %0 = extractelement <1 x i8> %uminv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vminvq_u16(<8 x i16> %a) {
-; CHECK: test_vminvq_u16:
-; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
- %uminv.i = tail call <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v8i16(<8 x i16> %a)
- %0 = extractelement <1 x i16> %uminv.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vminvq_u32(<4 x i32> %a) {
-; CHECK: test_vminvq_u32:
-; CHECK: uminv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %uminv.i = tail call <1 x i32> @llvm.aarch64.neon.uminv.v1i32.v4i32(<4 x i32> %a)
- %0 = extractelement <1 x i32> %uminv.i, i32 0
- ret i32 %0
-}
-
-define i8 @test_vaddv_s8(<8 x i8> %a) {
-; CHECK: test_vaddv_s8:
-; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
- %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v8i8(<8 x i8> %a)
- %0 = extractelement <1 x i8> %vaddv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vaddv_s16(<4 x i16> %a) {
-; CHECK: test_vaddv_s16:
-; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
- %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v4i16(<4 x i16> %a)
- %0 = extractelement <1 x i16> %vaddv.i, i32 0
- ret i16 %0
-}
-
-define i8 @test_vaddv_u8(<8 x i8> %a) {
-; CHECK: test_vaddv_u8:
-; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
- %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v8i8(<8 x i8> %a)
- %0 = extractelement <1 x i8> %vaddv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vaddv_u16(<4 x i16> %a) {
-; CHECK: test_vaddv_u16:
-; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
- %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v4i16(<4 x i16> %a)
- %0 = extractelement <1 x i16> %vaddv.i, i32 0
- ret i16 %0
-}
-
-define i8 @test_vaddvq_s8(<16 x i8> %a) {
-; CHECK: test_vaddvq_s8:
-; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
- %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v16i8(<16 x i8> %a)
- %0 = extractelement <1 x i8> %vaddv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vaddvq_s16(<8 x i16> %a) {
-; CHECK: test_vaddvq_s16:
-; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
- %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v8i16(<8 x i16> %a)
- %0 = extractelement <1 x i16> %vaddv.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vaddvq_s32(<4 x i32> %a) {
-; CHECK: test_vaddvq_s32:
-; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %vaddv.i = tail call <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v4i32(<4 x i32> %a)
- %0 = extractelement <1 x i32> %vaddv.i, i32 0
- ret i32 %0
-}
-
-define i8 @test_vaddvq_u8(<16 x i8> %a) {
-; CHECK: test_vaddvq_u8:
-; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
- %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v16i8(<16 x i8> %a)
- %0 = extractelement <1 x i8> %vaddv.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vaddvq_u16(<8 x i16> %a) {
-; CHECK: test_vaddvq_u16:
-; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
- %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v8i16(<8 x i16> %a)
- %0 = extractelement <1 x i16> %vaddv.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vaddvq_u32(<4 x i32> %a) {
-; CHECK: test_vaddvq_u32:
-; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %vaddv.i = tail call <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v4i32(<4 x i32> %a)
- %0 = extractelement <1 x i32> %vaddv.i, i32 0
- ret i32 %0
-}
-
-define float @test_vmaxvq_f32(<4 x float> %a) {
-; CHECK: test_vmaxvq_f32:
-; CHECK: fmaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %0 = call float @llvm.aarch64.neon.vmaxv(<4 x float> %a)
- ret float %0
-}
-
-define float @test_vminvq_f32(<4 x float> %a) {
-; CHECK: test_vminvq_f32:
-; CHECK: fminv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %0 = call float @llvm.aarch64.neon.vminv(<4 x float> %a)
- ret float %0
-}
-
-define float @test_vmaxnmvq_f32(<4 x float> %a) {
-; CHECK: test_vmaxnmvq_f32:
-; CHECK: fmaxnmv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %0 = call float @llvm.aarch64.neon.vmaxnmv(<4 x float> %a)
- ret float %0
-}
-
-define float @test_vminnmvq_f32(<4 x float> %a) {
-; CHECK: test_vminnmvq_f32:
-; CHECK: fminnmv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %0 = call float @llvm.aarch64.neon.vminnmv(<4 x float> %a)
- ret float %0
-}
-
diff --git a/test/CodeGen/AArch64/neon-add-pairwise.ll b/test/CodeGen/AArch64/neon-add-pairwise.ll
deleted file mode 100644
index 32d8222..0000000
--- a/test/CodeGen/AArch64/neon-add-pairwise.ll
+++ /dev/null
@@ -1,101 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_addp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; Using registers other than v0, v1 are possible, but would be odd.
-; CHECK: test_addp_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: addp v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vpadd.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_addp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_addp_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vpadd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: addp v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_addp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_addp_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: addp v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-declare <8 x i16> @llvm.arm.neon.vpadd.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_addp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_addp_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vpadd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: addp v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_addp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_addp_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: addp v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vpadd.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_addp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_addp_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vpadd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: addp v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-
-declare <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64>, <2 x i64>)
-
-define <2 x i64> @test_addp_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_addp_v2i64:
- %val = call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: addp v0.2d, v0.2d, v1.2d
- ret <2 x i64> %val
-}
-
-declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.arm.neon.vpadd.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.arm.neon.vpadd.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @test_faddp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; CHECK: test_faddp_v2f32:
- %val = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %lhs, <2 x float> %rhs)
-; CHECK: faddp v0.2s, v0.2s, v1.2s
- ret <2 x float> %val
-}
-
-define <4 x float> @test_faddp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; CHECK: test_faddp_v4f32:
- %val = call <4 x float> @llvm.arm.neon.vpadd.v4f32(<4 x float> %lhs, <4 x float> %rhs)
-; CHECK: faddp v0.4s, v0.4s, v1.4s
- ret <4 x float> %val
-}
-
-define <2 x double> @test_faddp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; CHECK: test_faddp_v2f64:
- %val = call <2 x double> @llvm.arm.neon.vpadd.v2f64(<2 x double> %lhs, <2 x double> %rhs)
-; CHECK: faddp v0.2d, v0.2d, v1.2d
- ret <2 x double> %val
-}
-
-define i32 @test_vaddv.v2i32(<2 x i32> %a) {
-; CHECK-LABEL: test_vaddv.v2i32
-; CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %1 = tail call <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v2i32(<2 x i32> %a)
- %2 = extractelement <1 x i32> %1, i32 0
- ret i32 %2
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v2i32(<2 x i32>)
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index 7e5b693..6497856 100644
--- a/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -1,45 +1,52 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
define <8 x i8> @and8xi8(<8 x i8> %a, <8 x i8> %b) {
-;CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: and8xi8:
+; CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = and <8 x i8> %a, %b;
ret <8 x i8> %tmp1
}
define <16 x i8> @and16xi8(<16 x i8> %a, <16 x i8> %b) {
-;CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: and16xi8:
+; CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = and <16 x i8> %a, %b;
ret <16 x i8> %tmp1
}
define <8 x i8> @orr8xi8(<8 x i8> %a, <8 x i8> %b) {
-;CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: orr8xi8:
+; CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = or <8 x i8> %a, %b;
ret <8 x i8> %tmp1
}
define <16 x i8> @orr16xi8(<16 x i8> %a, <16 x i8> %b) {
-;CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: orr16xi8:
+; CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = or <16 x i8> %a, %b;
ret <16 x i8> %tmp1
}
define <8 x i8> @xor8xi8(<8 x i8> %a, <8 x i8> %b) {
-;CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: xor8xi8:
+; CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <8 x i8> %a, %b;
ret <8 x i8> %tmp1
}
define <16 x i8> @xor16xi8(<16 x i8> %a, <16 x i8> %b) {
-;CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: xor16xi8:
+; CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <16 x i8> %a, %b;
ret <16 x i8> %tmp1
}
define <8 x i8> @bsl8xi8_const(<8 x i8> %a, <8 x i8> %b) {
-;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bsl8xi8_const:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = and <8 x i8> %a, < i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0 >
%tmp2 = and <8 x i8> %b, < i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 -1 >
%tmp3 = or <8 x i8> %tmp1, %tmp2
@@ -47,7 +54,8 @@ define <8 x i8> @bsl8xi8_const(<8 x i8> %a, <8 x i8> %b) {
}
define <16 x i8> @bsl16xi8_const(<16 x i8> %a, <16 x i8> %b) {
-;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bsl16xi8_const:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = and <16 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0 >
%tmp2 = and <16 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp3 = or <16 x i8> %tmp1, %tmp2
@@ -55,398 +63,461 @@ define <16 x i8> @bsl16xi8_const(<16 x i8> %a, <16 x i8> %b) {
}
define <8 x i8> @orn8xi8(<8 x i8> %a, <8 x i8> %b) {
-;CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: orn8xi8:
+; CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <8 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp2 = or <8 x i8> %a, %tmp1
ret <8 x i8> %tmp2
}
define <16 x i8> @orn16xi8(<16 x i8> %a, <16 x i8> %b) {
-;CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: orn16xi8:
+; CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <16 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp2 = or <16 x i8> %a, %tmp1
ret <16 x i8> %tmp2
}
define <8 x i8> @bic8xi8(<8 x i8> %a, <8 x i8> %b) {
-;CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bic8xi8:
+; CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <8 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp2 = and <8 x i8> %a, %tmp1
ret <8 x i8> %tmp2
}
define <16 x i8> @bic16xi8(<16 x i8> %a, <16 x i8> %b) {
-;CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bic16xi8:
+; CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <16 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp2 = and <16 x i8> %a, %tmp1
ret <16 x i8> %tmp2
}
define <2 x i32> @orrimm2s_lsl0(<2 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff
+; CHECK-LABEL: orrimm2s_lsl0:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}
%tmp1 = or <2 x i32> %a, < i32 255, i32 255>
ret <2 x i32> %tmp1
}
define <2 x i32> @orrimm2s_lsl8(<2 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #8
+; CHECK-LABEL: orrimm2s_lsl8:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #8
%tmp1 = or <2 x i32> %a, < i32 65280, i32 65280>
ret <2 x i32> %tmp1
}
define <2 x i32> @orrimm2s_lsl16(<2 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #16
+; CHECK-LABEL: orrimm2s_lsl16:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #16
%tmp1 = or <2 x i32> %a, < i32 16711680, i32 16711680>
ret <2 x i32> %tmp1
}
define <2 x i32> @orrimm2s_lsl24(<2 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #24
+; CHECK-LABEL: orrimm2s_lsl24:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #24
%tmp1 = or <2 x i32> %a, < i32 4278190080, i32 4278190080>
ret <2 x i32> %tmp1
}
define <4 x i32> @orrimm4s_lsl0(<4 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff
+; CHECK-LABEL: orrimm4s_lsl0:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}
%tmp1 = or <4 x i32> %a, < i32 255, i32 255, i32 255, i32 255>
ret <4 x i32> %tmp1
}
define <4 x i32> @orrimm4s_lsl8(<4 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #8
+; CHECK-LABEL: orrimm4s_lsl8:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #8
%tmp1 = or <4 x i32> %a, < i32 65280, i32 65280, i32 65280, i32 65280>
ret <4 x i32> %tmp1
}
define <4 x i32> @orrimm4s_lsl16(<4 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #16
+; CHECK-LABEL: orrimm4s_lsl16:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #16
%tmp1 = or <4 x i32> %a, < i32 16711680, i32 16711680, i32 16711680, i32 16711680>
ret <4 x i32> %tmp1
}
define <4 x i32> @orrimm4s_lsl24(<4 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #24
+; CHECK-LABEL: orrimm4s_lsl24:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #24
%tmp1 = or <4 x i32> %a, < i32 4278190080, i32 4278190080, i32 4278190080, i32 4278190080>
ret <4 x i32> %tmp1
}
define <4 x i16> @orrimm4h_lsl0(<4 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.4h, #0xff
+; CHECK-LABEL: orrimm4h_lsl0:
+; CHECK: orr {{v[0-9]+}}.4h, #{{0xff|255}}
%tmp1 = or <4 x i16> %a, < i16 255, i16 255, i16 255, i16 255 >
ret <4 x i16> %tmp1
}
define <4 x i16> @orrimm4h_lsl8(<4 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.4h, #0xff, lsl #8
+; CHECK-LABEL: orrimm4h_lsl8:
+; CHECK: orr {{v[0-9]+}}.4h, #{{0xff|255}}, lsl #8
%tmp1 = or <4 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280 >
ret <4 x i16> %tmp1
}
define <8 x i16> @orrimm8h_lsl0(<8 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.8h, #0xff
+; CHECK-LABEL: orrimm8h_lsl0:
+; CHECK: orr {{v[0-9]+}}.8h, #{{0xff|255}}
%tmp1 = or <8 x i16> %a, < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255 >
ret <8 x i16> %tmp1
}
define <8 x i16> @orrimm8h_lsl8(<8 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.8h, #0xff, lsl #8
+; CHECK-LABEL: orrimm8h_lsl8:
+; CHECK: orr {{v[0-9]+}}.8h, #{{0xff|255}}, lsl #8
%tmp1 = or <8 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 >
ret <8 x i16> %tmp1
}
define <2 x i32> @bicimm2s_lsl0(<2 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0x10
+; CHECK-LABEL: bicimm2s_lsl0:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0x10|16}}
%tmp1 = and <2 x i32> %a, < i32 4294967279, i32 4294967279 >
ret <2 x i32> %tmp1
}
define <2 x i32> @bicimm2s_lsl8(<2 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0x10, lsl #8
+; CHECK-LABEL: bicimm2s_lsl8:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0x10|16}}, lsl #8
%tmp1 = and <2 x i32> %a, < i32 4294963199, i32 4294963199 >
ret <2 x i32> %tmp1
}
define <2 x i32> @bicimm2s_lsl16(<2 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0x10, lsl #16
+; CHECK-LABEL: bicimm2s_lsl16:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0x10|16}}, lsl #16
%tmp1 = and <2 x i32> %a, < i32 4293918719, i32 4293918719 >
ret <2 x i32> %tmp1
}
define <2 x i32> @bicimm2s_lsl124(<2 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0x10, lsl #24
+; CHECK-LABEL: bicimm2s_lsl124:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0x10|16}}, lsl #24
%tmp1 = and <2 x i32> %a, < i32 4026531839, i32 4026531839>
ret <2 x i32> %tmp1
}
define <4 x i32> @bicimm4s_lsl0(<4 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0x10
+; CHECK-LABEL: bicimm4s_lsl0:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0x10|16}}
%tmp1 = and <4 x i32> %a, < i32 4294967279, i32 4294967279, i32 4294967279, i32 4294967279 >
ret <4 x i32> %tmp1
}
define <4 x i32> @bicimm4s_lsl8(<4 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0x10, lsl #8
+; CHECK-LABEL: bicimm4s_lsl8:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0x10|16}}, lsl #8
%tmp1 = and <4 x i32> %a, < i32 4294963199, i32 4294963199, i32 4294963199, i32 4294963199 >
ret <4 x i32> %tmp1
}
define <4 x i32> @bicimm4s_lsl16(<4 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0x10, lsl #16
+; CHECK-LABEL: bicimm4s_lsl16:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0x10|16}}, lsl #16
%tmp1 = and <4 x i32> %a, < i32 4293918719, i32 4293918719, i32 4293918719, i32 4293918719 >
ret <4 x i32> %tmp1
}
define <4 x i32> @bicimm4s_lsl124(<4 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0x10, lsl #24
+; CHECK-LABEL: bicimm4s_lsl124:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0x10|16}}, lsl #24
%tmp1 = and <4 x i32> %a, < i32 4026531839, i32 4026531839, i32 4026531839, i32 4026531839>
ret <4 x i32> %tmp1
}
define <4 x i16> @bicimm4h_lsl0_a(<4 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.4h, #0x10
+; CHECK-LABEL: bicimm4h_lsl0_a:
+; CHECK: bic {{v[0-9]+}}.4h, #{{0x10|16}}
%tmp1 = and <4 x i16> %a, < i16 4294967279, i16 4294967279, i16 4294967279, i16 4294967279 >
ret <4 x i16> %tmp1
}
define <4 x i16> @bicimm4h_lsl0_b(<4 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.4h, #0xff
+; CHECK-LABEL: bicimm4h_lsl0_b:
+; CHECK: bic {{v[0-9]+}}.4h, #{{0xff|255}}
%tmp1 = and <4 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280 >
ret <4 x i16> %tmp1
}
define <4 x i16> @bicimm4h_lsl8_a(<4 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.4h, #0x10, lsl #8
+; CHECK-LABEL: bicimm4h_lsl8_a:
+; CHECK: bic {{v[0-9]+}}.4h, #{{0x10|16}}, lsl #8
%tmp1 = and <4 x i16> %a, < i16 4294963199, i16 4294963199, i16 4294963199, i16 4294963199>
ret <4 x i16> %tmp1
}
define <4 x i16> @bicimm4h_lsl8_b(<4 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.4h, #0xff, lsl #8
+; CHECK-LABEL: bicimm4h_lsl8_b:
+; CHECK: bic {{v[0-9]+}}.4h, #{{0xff|255}}, lsl #8
%tmp1 = and <4 x i16> %a, < i16 255, i16 255, i16 255, i16 255>
ret <4 x i16> %tmp1
}
define <8 x i16> @bicimm8h_lsl0_a(<8 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.8h, #0x10
+; CHECK-LABEL: bicimm8h_lsl0_a:
+; CHECK: bic {{v[0-9]+}}.8h, #{{0x10|16}}
%tmp1 = and <8 x i16> %a, < i16 4294967279, i16 4294967279, i16 4294967279, i16 4294967279,
i16 4294967279, i16 4294967279, i16 4294967279, i16 4294967279 >
ret <8 x i16> %tmp1
}
define <8 x i16> @bicimm8h_lsl0_b(<8 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.8h, #0xff
+; CHECK-LABEL: bicimm8h_lsl0_b:
+; CHECK: bic {{v[0-9]+}}.8h, #{{0xff|255}}
%tmp1 = and <8 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 >
ret <8 x i16> %tmp1
}
define <8 x i16> @bicimm8h_lsl8_a(<8 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.8h, #0x10, lsl #8
+; CHECK-LABEL: bicimm8h_lsl8_a:
+; CHECK: bic {{v[0-9]+}}.8h, #{{0x10|16}}, lsl #8
%tmp1 = and <8 x i16> %a, < i16 4294963199, i16 4294963199, i16 4294963199, i16 4294963199,
i16 4294963199, i16 4294963199, i16 4294963199, i16 4294963199>
ret <8 x i16> %tmp1
}
define <8 x i16> @bicimm8h_lsl8_b(<8 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.8h, #0xff, lsl #8
+; CHECK-LABEL: bicimm8h_lsl8_b:
+; CHECK: bic {{v[0-9]+}}.8h, #{{0xff|255}}, lsl #8
%tmp1 = and <8 x i16> %a, < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
ret <8 x i16> %tmp1
}
define <2 x i32> @and2xi32(<2 x i32> %a, <2 x i32> %b) {
-;CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: and2xi32:
+; CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = and <2 x i32> %a, %b;
ret <2 x i32> %tmp1
}
define <4 x i16> @and4xi16(<4 x i16> %a, <4 x i16> %b) {
-;CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: and4xi16:
+; CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = and <4 x i16> %a, %b;
ret <4 x i16> %tmp1
}
define <1 x i64> @and1xi64(<1 x i64> %a, <1 x i64> %b) {
-;CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: and1xi64:
+; CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = and <1 x i64> %a, %b;
ret <1 x i64> %tmp1
}
define <4 x i32> @and4xi32(<4 x i32> %a, <4 x i32> %b) {
-;CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: and4xi32:
+; CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = and <4 x i32> %a, %b;
ret <4 x i32> %tmp1
}
define <8 x i16> @and8xi16(<8 x i16> %a, <8 x i16> %b) {
-;CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: and8xi16:
+; CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = and <8 x i16> %a, %b;
ret <8 x i16> %tmp1
}
define <2 x i64> @and2xi64(<2 x i64> %a, <2 x i64> %b) {
-;CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: and2xi64:
+; CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = and <2 x i64> %a, %b;
ret <2 x i64> %tmp1
}
define <2 x i32> @orr2xi32(<2 x i32> %a, <2 x i32> %b) {
-;CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: orr2xi32:
+; CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = or <2 x i32> %a, %b;
ret <2 x i32> %tmp1
}
define <4 x i16> @orr4xi16(<4 x i16> %a, <4 x i16> %b) {
-;CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: orr4xi16:
+; CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = or <4 x i16> %a, %b;
ret <4 x i16> %tmp1
}
define <1 x i64> @orr1xi64(<1 x i64> %a, <1 x i64> %b) {
-;CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: orr1xi64:
+; CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = or <1 x i64> %a, %b;
ret <1 x i64> %tmp1
}
define <4 x i32> @orr4xi32(<4 x i32> %a, <4 x i32> %b) {
-;CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: orr4xi32:
+; CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = or <4 x i32> %a, %b;
ret <4 x i32> %tmp1
}
define <8 x i16> @orr8xi16(<8 x i16> %a, <8 x i16> %b) {
-;CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: orr8xi16:
+; CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = or <8 x i16> %a, %b;
ret <8 x i16> %tmp1
}
define <2 x i64> @orr2xi64(<2 x i64> %a, <2 x i64> %b) {
-;CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: orr2xi64:
+; CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = or <2 x i64> %a, %b;
ret <2 x i64> %tmp1
}
define <2 x i32> @eor2xi32(<2 x i32> %a, <2 x i32> %b) {
-;CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: eor2xi32:
+; CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <2 x i32> %a, %b;
ret <2 x i32> %tmp1
}
define <4 x i16> @eor4xi16(<4 x i16> %a, <4 x i16> %b) {
-;CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: eor4xi16:
+; CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <4 x i16> %a, %b;
ret <4 x i16> %tmp1
}
define <1 x i64> @eor1xi64(<1 x i64> %a, <1 x i64> %b) {
-;CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: eor1xi64:
+; CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <1 x i64> %a, %b;
ret <1 x i64> %tmp1
}
define <4 x i32> @eor4xi32(<4 x i32> %a, <4 x i32> %b) {
-;CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: eor4xi32:
+; CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <4 x i32> %a, %b;
ret <4 x i32> %tmp1
}
define <8 x i16> @eor8xi16(<8 x i16> %a, <8 x i16> %b) {
-;CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: eor8xi16:
+; CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <8 x i16> %a, %b;
ret <8 x i16> %tmp1
}
define <2 x i64> @eor2xi64(<2 x i64> %a, <2 x i64> %b) {
-;CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: eor2xi64:
+; CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <2 x i64> %a, %b;
ret <2 x i64> %tmp1
}
define <2 x i32> @bic2xi32(<2 x i32> %a, <2 x i32> %b) {
-;CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bic2xi32:
+; CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <2 x i32> %b, < i32 -1, i32 -1 >
%tmp2 = and <2 x i32> %a, %tmp1
ret <2 x i32> %tmp2
}
define <4 x i16> @bic4xi16(<4 x i16> %a, <4 x i16> %b) {
-;CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bic4xi16:
+; CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <4 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1 >
%tmp2 = and <4 x i16> %a, %tmp1
ret <4 x i16> %tmp2
}
define <1 x i64> @bic1xi64(<1 x i64> %a, <1 x i64> %b) {
-;CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bic1xi64:
+; CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <1 x i64> %b, < i64 -1>
%tmp2 = and <1 x i64> %a, %tmp1
ret <1 x i64> %tmp2
}
define <4 x i32> @bic4xi32(<4 x i32> %a, <4 x i32> %b) {
-;CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bic4xi32:
+; CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <4 x i32> %b, < i32 -1, i32 -1, i32 -1, i32 -1>
%tmp2 = and <4 x i32> %a, %tmp1
ret <4 x i32> %tmp2
}
define <8 x i16> @bic8xi16(<8 x i16> %a, <8 x i16> %b) {
-;CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bic8xi16:
+; CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <8 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1, i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp2 = and <8 x i16> %a, %tmp1
ret <8 x i16> %tmp2
}
define <2 x i64> @bic2xi64(<2 x i64> %a, <2 x i64> %b) {
-;CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bic2xi64:
+; CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <2 x i64> %b, < i64 -1, i64 -1>
%tmp2 = and <2 x i64> %a, %tmp1
ret <2 x i64> %tmp2
}
define <2 x i32> @orn2xi32(<2 x i32> %a, <2 x i32> %b) {
-;CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: orn2xi32:
+; CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <2 x i32> %b, < i32 -1, i32 -1 >
%tmp2 = or <2 x i32> %a, %tmp1
ret <2 x i32> %tmp2
}
define <4 x i16> @orn4xi16(<4 x i16> %a, <4 x i16> %b) {
-;CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: orn4xi16:
+; CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <4 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1 >
%tmp2 = or <4 x i16> %a, %tmp1
ret <4 x i16> %tmp2
}
define <1 x i64> @orn1xi64(<1 x i64> %a, <1 x i64> %b) {
-;CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: orn1xi64:
+; CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = xor <1 x i64> %b, < i64 -1>
%tmp2 = or <1 x i64> %a, %tmp1
ret <1 x i64> %tmp2
}
define <4 x i32> @orn4xi32(<4 x i32> %a, <4 x i32> %b) {
-;CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: orn4xi32:
+; CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <4 x i32> %b, < i32 -1, i32 -1, i32 -1, i32 -1>
%tmp2 = or <4 x i32> %a, %tmp1
ret <4 x i32> %tmp2
}
define <8 x i16> @orn8xi16(<8 x i16> %a, <8 x i16> %b) {
-;CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: orn8xi16:
+; CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <8 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1, i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp2 = or <8 x i16> %a, %tmp1
ret <8 x i16> %tmp2
}
define <2 x i64> @orn2xi64(<2 x i64> %a, <2 x i64> %b) {
-;CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: orn2xi64:
+; CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = xor <2 x i64> %b, < i64 -1, i64 -1>
%tmp2 = or <2 x i64> %a, %tmp1
ret <2 x i64> %tmp2
}
define <2 x i32> @bsl2xi32_const(<2 x i32> %a, <2 x i32> %b) {
-;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bsl2xi32_const:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = and <2 x i32> %a, < i32 -1, i32 0 >
%tmp2 = and <2 x i32> %b, < i32 0, i32 -1 >
%tmp3 = or <2 x i32> %tmp1, %tmp2
@@ -455,7 +526,8 @@ define <2 x i32> @bsl2xi32_const(<2 x i32> %a, <2 x i32> %b) {
define <4 x i16> @bsl4xi16_const(<4 x i16> %a, <4 x i16> %b) {
-;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bsl4xi16_const:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = and <4 x i16> %a, < i16 -1, i16 0, i16 -1,i16 0 >
%tmp2 = and <4 x i16> %b, < i16 0, i16 -1,i16 0, i16 -1 >
%tmp3 = or <4 x i16> %tmp1, %tmp2
@@ -463,7 +535,8 @@ define <4 x i16> @bsl4xi16_const(<4 x i16> %a, <4 x i16> %b) {
}
define <1 x i64> @bsl1xi64_const(<1 x i64> %a, <1 x i64> %b) {
-;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bsl1xi64_const:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp1 = and <1 x i64> %a, < i64 -16 >
%tmp2 = and <1 x i64> %b, < i64 15 >
%tmp3 = or <1 x i64> %tmp1, %tmp2
@@ -471,7 +544,8 @@ define <1 x i64> @bsl1xi64_const(<1 x i64> %a, <1 x i64> %b) {
}
define <4 x i32> @bsl4xi32_const(<4 x i32> %a, <4 x i32> %b) {
-;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bsl4xi32_const:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = and <4 x i32> %a, < i32 -1, i32 0, i32 -1, i32 0 >
%tmp2 = and <4 x i32> %b, < i32 0, i32 -1, i32 0, i32 -1 >
%tmp3 = or <4 x i32> %tmp1, %tmp2
@@ -479,7 +553,8 @@ define <4 x i32> @bsl4xi32_const(<4 x i32> %a, <4 x i32> %b) {
}
define <8 x i16> @bsl8xi16_const(<8 x i16> %a, <8 x i16> %b) {
-;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bsl8xi16_const:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = and <8 x i16> %a, < i16 -1, i16 -1, i16 0,i16 0, i16 -1, i16 -1, i16 0,i16 0 >
%tmp2 = and <8 x i16> %b, < i16 0, i16 0, i16 -1, i16 -1, i16 0, i16 0, i16 -1, i16 -1 >
%tmp3 = or <8 x i16> %tmp1, %tmp2
@@ -487,7 +562,8 @@ define <8 x i16> @bsl8xi16_const(<8 x i16> %a, <8 x i16> %b) {
}
define <2 x i64> @bsl2xi64_const(<2 x i64> %a, <2 x i64> %b) {
-;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bsl2xi64_const:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp1 = and <2 x i64> %a, < i64 -1, i64 0 >
%tmp2 = and <2 x i64> %b, < i64 0, i64 -1 >
%tmp3 = or <2 x i64> %tmp1, %tmp2
@@ -496,7 +572,8 @@ define <2 x i64> @bsl2xi64_const(<2 x i64> %a, <2 x i64> %b) {
define <8 x i8> @bsl8xi8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
-;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bsl8xi8:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%1 = and <8 x i8> %v1, %v2
%2 = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
%3 = and <8 x i8> %2, %v3
@@ -505,7 +582,8 @@ define <8 x i8> @bsl8xi8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
}
define <4 x i16> @bsl4xi16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) {
-;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bsl4xi16:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%1 = and <4 x i16> %v1, %v2
%2 = xor <4 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1>
%3 = and <4 x i16> %2, %v3
@@ -514,7 +592,8 @@ define <4 x i16> @bsl4xi16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) {
}
define <2 x i32> @bsl2xi32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
-;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bsl2xi32:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%1 = and <2 x i32> %v1, %v2
%2 = xor <2 x i32> %v1, <i32 -1, i32 -1>
%3 = and <2 x i32> %2, %v3
@@ -523,7 +602,8 @@ define <2 x i32> @bsl2xi32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
}
define <1 x i64> @bsl1xi64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
-;CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: bsl1xi64:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%1 = and <1 x i64> %v1, %v2
%2 = xor <1 x i64> %v1, <i64 -1>
%3 = and <1 x i64> %2, %v3
@@ -532,7 +612,8 @@ define <1 x i64> @bsl1xi64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
}
define <16 x i8> @bsl16xi8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
-;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bsl16xi8:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%1 = and <16 x i8> %v1, %v2
%2 = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
%3 = and <16 x i8> %2, %v3
@@ -541,7 +622,8 @@ define <16 x i8> @bsl16xi8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
}
define <8 x i16> @bsl8xi16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) {
-;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bsl8xi16:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%1 = and <8 x i16> %v1, %v2
%2 = xor <8 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
%3 = and <8 x i16> %2, %v3
@@ -550,7 +632,8 @@ define <8 x i16> @bsl8xi16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) {
}
define <4 x i32> @bsl4xi32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
-;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bsl4xi32:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%1 = and <4 x i32> %v1, %v2
%2 = xor <4 x i32> %v1, <i32 -1, i32 -1, i32 -1, i32 -1>
%3 = and <4 x i32> %2, %v3
@@ -559,56 +642,63 @@ define <4 x i32> @bsl4xi32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
}
define <8 x i8> @vselect_v8i8(<8 x i8> %a) {
-;CHECK: movi {{d[0-9]+}}, #0xffff
-;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: vselect_v8i8:
+; CHECK: movi {{d[0-9]+}}, #0x{{0*}}ffff
+; CHECK-NEXT: {{bsl v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b|and v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b}}
%b = select <8 x i1> <i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i8> %a, <8 x i8> <i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
ret <8 x i8> %b
}
define <4 x i16> @vselect_v4i16(<4 x i16> %a) {
-;CHECK: movi {{d[0-9]+}}, #0xffff
-;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: vselect_v4i16:
+; CHECK: movi {{d[0-9]+}}, #0x{{0*}}ffff
+; CHECK-NEXT: {{bsl v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b|and v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b}}
%b = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i16> %a, <4 x i16> <i16 undef, i16 0, i16 0, i16 0>
ret <4 x i16> %b
}
define <8 x i8> @vselect_cmp_ne(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
-;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: vselect_cmp_ne:
+; CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%cmp = icmp ne <8 x i8> %a, %b
%d = select <8 x i1> %cmp, <8 x i8> %b, <8 x i8> %c
ret <8 x i8> %d
}
define <8 x i8> @vselect_cmp_eq(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
-;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: vselect_cmp_eq:
+; CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%cmp = icmp eq <8 x i8> %a, %b
%d = select <8 x i1> %cmp, <8 x i8> %b, <8 x i8> %c
ret <8 x i8> %d
}
define <8 x i8> @vselect_cmpz_ne(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
-;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: vselect_cmpz_ne:
+; CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%cmp = icmp ne <8 x i8> %a, zeroinitializer
%d = select <8 x i1> %cmp, <8 x i8> %b, <8 x i8> %c
ret <8 x i8> %d
}
define <8 x i8> @vselect_cmpz_eq(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
-;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
-;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: vselect_cmpz_eq:
+; CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
+; CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%cmp = icmp eq <8 x i8> %a, zeroinitializer
%d = select <8 x i1> %cmp, <8 x i8> %b, <8 x i8> %c
ret <8 x i8> %d
}
define <8 x i8> @vselect_tst(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
-;CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: vselect_tst:
+; CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = and <8 x i8> %a, %b
%tmp4 = icmp ne <8 x i8> %tmp3, zeroinitializer
%d = select <8 x i1> %tmp4, <8 x i8> %b, <8 x i8> %c
@@ -616,7 +706,8 @@ define <8 x i8> @vselect_tst(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
}
define <2 x i64> @bsl2xi64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) {
-;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: bsl2xi64:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%1 = and <2 x i64> %v1, %v2
%2 = xor <2 x i64> %v1, <i64 -1, i64 -1>
%3 = and <2 x i64> %2, %v3
@@ -625,458 +716,534 @@ define <2 x i64> @bsl2xi64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) {
}
define <8 x i8> @orrimm8b_as_orrimm4h_lsl0(<8 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.4h, #0xff
+; CHECK-LABEL: orrimm8b_as_orrimm4h_lsl0:
+; CHECK: orr {{v[0-9]+}}.4h, #{{0xff|255}}
%val = or <8 x i8> %a, <i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0>
ret <8 x i8> %val
}
define <8 x i8> @orrimm8b_as_orimm4h_lsl8(<8 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.4h, #0xff, lsl #8
+; CHECK-LABEL: orrimm8b_as_orimm4h_lsl8:
+; CHECK: orr {{v[0-9]+}}.4h, #{{0xff|255}}, lsl #8
%val = or <8 x i8> %a, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
ret <8 x i8> %val
}
define <16 x i8> @orimm16b_as_orrimm8h_lsl0(<16 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.8h, #0xff
+; CHECK-LABEL: orimm16b_as_orrimm8h_lsl0:
+; CHECK: orr {{v[0-9]+}}.8h, #{{0xff|255}}
%val = or <16 x i8> %a, <i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0>
ret <16 x i8> %val
}
define <16 x i8> @orimm16b_as_orrimm8h_lsl8(<16 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.8h, #0xff, lsl #8
+; CHECK-LABEL: orimm16b_as_orrimm8h_lsl8:
+; CHECK: orr {{v[0-9]+}}.8h, #{{0xff|255}}, lsl #8
%val = or <16 x i8> %a, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
ret <16 x i8> %val
}
define <8 x i8> @and8imm2s_lsl0(<8 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xff
+; CHECK-LABEL: and8imm2s_lsl0:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xff|255}}
%tmp1 = and <8 x i8> %a, < i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255>
ret <8 x i8> %tmp1
}
define <8 x i8> @and8imm2s_lsl8(<8 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #8
+; CHECK-LABEL: and8imm2s_lsl8:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #8
%tmp1 = and <8 x i8> %a, < i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255>
ret <8 x i8> %tmp1
}
define <8 x i8> @and8imm2s_lsl16(<8 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #16
+; CHECK-LABEL: and8imm2s_lsl16:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #16
%tmp1 = and <8 x i8> %a, < i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255>
ret <8 x i8> %tmp1
}
define <8 x i8> @and8imm2s_lsl24(<8 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xfe, lsl #24
+; CHECK-LABEL: and8imm2s_lsl24:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xfe|254}}, lsl #24
%tmp1 = and <8 x i8> %a, < i8 255, i8 255, i8 255, i8 1, i8 255, i8 255, i8 255, i8 1>
ret <8 x i8> %tmp1
}
define <4 x i16> @and16imm2s_lsl0(<4 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xff
+; CHECK-LABEL: and16imm2s_lsl0:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xff|255}}
%tmp1 = and <4 x i16> %a, < i16 65280, i16 65535, i16 65280, i16 65535>
ret <4 x i16> %tmp1
}
define <4 x i16> @and16imm2s_lsl8(<4 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #8
+; CHECK-LABEL: and16imm2s_lsl8:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #8
%tmp1 = and <4 x i16> %a, < i16 255, i16 65535, i16 255, i16 65535>
ret <4 x i16> %tmp1
}
define <4 x i16> @and16imm2s_lsl16(<4 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #16
+; CHECK-LABEL: and16imm2s_lsl16:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #16
%tmp1 = and <4 x i16> %a, < i16 65535, i16 65280, i16 65535, i16 65280>
ret <4 x i16> %tmp1
}
define <4 x i16> @and16imm2s_lsl24(<4 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xfe, lsl #24
+; CHECK-LABEL: and16imm2s_lsl24:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xfe|254}}, lsl #24
%tmp1 = and <4 x i16> %a, < i16 65535, i16 511, i16 65535, i16 511>
ret <4 x i16> %tmp1
}
define <1 x i64> @and64imm2s_lsl0(<1 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xff
+; CHECK-LABEL: and64imm2s_lsl0:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xff|255}}
%tmp1 = and <1 x i64> %a, < i64 -1095216660736>
ret <1 x i64> %tmp1
}
define <1 x i64> @and64imm2s_lsl8(<1 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #8
+; CHECK-LABEL: and64imm2s_lsl8:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #8
%tmp1 = and <1 x i64> %a, < i64 -280375465148161>
ret <1 x i64> %tmp1
}
define <1 x i64> @and64imm2s_lsl16(<1 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xff, lsl #16
+; CHECK-LABEL: and64imm2s_lsl16:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #16
%tmp1 = and <1 x i64> %a, < i64 -71776119077928961>
ret <1 x i64> %tmp1
}
define <1 x i64> @and64imm2s_lsl24(<1 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.2s, #0xfe, lsl #24
+; CHECK-LABEL: and64imm2s_lsl24:
+; CHECK: bic {{v[0-9]+}}.2s, #{{0xfe|254}}, lsl #24
%tmp1 = and <1 x i64> %a, < i64 144115183814443007>
ret <1 x i64> %tmp1
}
define <16 x i8> @and8imm4s_lsl0(<16 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xff
+; CHECK-LABEL: and8imm4s_lsl0:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xff|255}}
%tmp1 = and <16 x i8> %a, < i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255>
ret <16 x i8> %tmp1
}
define <16 x i8> @and8imm4s_lsl8(<16 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #8
+; CHECK-LABEL: and8imm4s_lsl8:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #8
%tmp1 = and <16 x i8> %a, < i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255>
ret <16 x i8> %tmp1
}
define <16 x i8> @and8imm4s_lsl16(<16 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #16
+; CHECK-LABEL: and8imm4s_lsl16:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #16
%tmp1 = and <16 x i8> %a, < i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255, i8 0, i8 255>
ret <16 x i8> %tmp1
}
define <16 x i8> @and8imm4s_lsl24(<16 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xfe, lsl #24
+; CHECK-LABEL: and8imm4s_lsl24:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xfe|254}}, lsl #24
%tmp1 = and <16 x i8> %a, < i8 255, i8 255, i8 255, i8 1, i8 255, i8 255, i8 255, i8 1, i8 255, i8 255, i8 255, i8 1, i8 255, i8 255, i8 255, i8 1>
ret <16 x i8> %tmp1
}
define <8 x i16> @and16imm4s_lsl0(<8 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xff
+; CHECK-LABEL: and16imm4s_lsl0:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xff|255}}
%tmp1 = and <8 x i16> %a, < i16 65280, i16 65535, i16 65280, i16 65535, i16 65280, i16 65535, i16 65280, i16 65535>
ret <8 x i16> %tmp1
}
define <8 x i16> @and16imm4s_lsl8(<8 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #8
+; CHECK-LABEL: and16imm4s_lsl8:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #8
%tmp1 = and <8 x i16> %a, < i16 255, i16 65535, i16 255, i16 65535, i16 255, i16 65535, i16 255, i16 65535>
ret <8 x i16> %tmp1
}
define <8 x i16> @and16imm4s_lsl16(<8 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #16
+; CHECK-LABEL: and16imm4s_lsl16:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #16
%tmp1 = and <8 x i16> %a, < i16 65535, i16 65280, i16 65535, i16 65280, i16 65535, i16 65280, i16 65535, i16 65280>
ret <8 x i16> %tmp1
}
define <8 x i16> @and16imm4s_lsl24(<8 x i16> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xfe, lsl #24
+; CHECK-LABEL: and16imm4s_lsl24:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xfe|254}}, lsl #24
%tmp1 = and <8 x i16> %a, < i16 65535, i16 511, i16 65535, i16 511, i16 65535, i16 511, i16 65535, i16 511>
ret <8 x i16> %tmp1
}
define <2 x i64> @and64imm4s_lsl0(<2 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xff
+; CHECK-LABEL: and64imm4s_lsl0:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xff|255}}
%tmp1 = and <2 x i64> %a, < i64 -1095216660736, i64 -1095216660736>
ret <2 x i64> %tmp1
}
define <2 x i64> @and64imm4s_lsl8(<2 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #8
+; CHECK-LABEL: and64imm4s_lsl8:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #8
%tmp1 = and <2 x i64> %a, < i64 -280375465148161, i64 -280375465148161>
ret <2 x i64> %tmp1
}
define <2 x i64> @and64imm4s_lsl16(<2 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xff, lsl #16
+; CHECK-LABEL: and64imm4s_lsl16:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #16
%tmp1 = and <2 x i64> %a, < i64 -71776119077928961, i64 -71776119077928961>
ret <2 x i64> %tmp1
}
define <2 x i64> @and64imm4s_lsl24(<2 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.4s, #0xfe, lsl #24
+; CHECK-LABEL: and64imm4s_lsl24:
+; CHECK: bic {{v[0-9]+}}.4s, #{{0xfe|254}}, lsl #24
%tmp1 = and <2 x i64> %a, < i64 144115183814443007, i64 144115183814443007>
ret <2 x i64> %tmp1
}
define <8 x i8> @and8imm4h_lsl0(<8 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.4h, #0xff
+; CHECK-LABEL: and8imm4h_lsl0:
+; CHECK: bic {{v[0-9]+}}.4h, #{{0xff|255}}
%tmp1 = and <8 x i8> %a, < i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
ret <8 x i8> %tmp1
}
define <8 x i8> @and8imm4h_lsl8(<8 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.4h, #0xff, lsl #8
+; CHECK-LABEL: and8imm4h_lsl8:
+; CHECK: bic {{v[0-9]+}}.4h, #{{0xff|255}}, lsl #8
%tmp1 = and <8 x i8> %a, < i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0>
ret <8 x i8> %tmp1
}
define <2 x i32> @and16imm4h_lsl0(<2 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.4h, #0xff
+; CHECK-LABEL: and16imm4h_lsl0:
+; CHECK: bic {{v[0-9]+}}.4h, #{{0xff|255}}
%tmp1 = and <2 x i32> %a, < i32 4278255360, i32 4278255360>
ret <2 x i32> %tmp1
}
define <2 x i32> @and16imm4h_lsl8(<2 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.4h, #0xff, lsl #8
+; CHECK-LABEL: and16imm4h_lsl8:
+; CHECK: bic {{v[0-9]+}}.4h, #{{0xff|255}}, lsl #8
%tmp1 = and <2 x i32> %a, < i32 16711935, i32 16711935>
ret <2 x i32> %tmp1
}
define <1 x i64> @and64imm4h_lsl0(<1 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.4h, #0xff
+; CHECK-LABEL: and64imm4h_lsl0:
+; CHECK: bic {{v[0-9]+}}.4h, #{{0xff|255}}
%tmp1 = and <1 x i64> %a, < i64 -71777214294589696>
ret <1 x i64> %tmp1
}
define <1 x i64> @and64imm4h_lsl8(<1 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.4h, #0xff, lsl #8
+; CHECK-LABEL: and64imm4h_lsl8:
+; CHECK: bic {{v[0-9]+}}.4h, #{{0xff|255}}, lsl #8
%tmp1 = and <1 x i64> %a, < i64 71777214294589695>
ret <1 x i64> %tmp1
}
define <16 x i8> @and8imm8h_lsl0(<16 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.8h, #0xff
+; CHECK-LABEL: and8imm8h_lsl0:
+; CHECK: bic {{v[0-9]+}}.8h, #{{0xff|255}}
%tmp1 = and <16 x i8> %a, < i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255 >
ret <16 x i8> %tmp1
}
define <16 x i8> @and8imm8h_lsl8(<16 x i8> %a) {
-;CHECK: bic {{v[0-9]+}}.8h, #0xff, lsl #8
+; CHECK-LABEL: and8imm8h_lsl8:
+; CHECK: bic {{v[0-9]+}}.8h, #{{0xff|255}}, lsl #8
%tmp1 = and <16 x i8> %a, <i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0 >
ret <16 x i8> %tmp1
}
define <4 x i32> @and16imm8h_lsl0(<4 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.8h, #0xff
+; CHECK-LABEL: and16imm8h_lsl0:
+; CHECK: bic {{v[0-9]+}}.8h, #{{0xff|255}}
%tmp1 = and <4 x i32> %a, < i32 4278255360, i32 4278255360, i32 4278255360, i32 4278255360>
ret <4 x i32> %tmp1
}
define <4 x i32> @and16imm8h_lsl8(<4 x i32> %a) {
-;CHECK: bic {{v[0-9]+}}.8h, #0xff, lsl #8
+; CHECK-LABEL: and16imm8h_lsl8:
+; CHECK: bic {{v[0-9]+}}.8h, #{{0xff|255}}, lsl #8
%tmp1 = and <4 x i32> %a, < i32 16711935, i32 16711935, i32 16711935, i32 16711935>
ret <4 x i32> %tmp1
}
define <2 x i64> @and64imm8h_lsl0(<2 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.8h, #0xff
+; CHECK-LABEL: and64imm8h_lsl0:
+; CHECK: bic {{v[0-9]+}}.8h, #{{0xff|255}}
%tmp1 = and <2 x i64> %a, < i64 -71777214294589696, i64 -71777214294589696>
ret <2 x i64> %tmp1
}
define <2 x i64> @and64imm8h_lsl8(<2 x i64> %a) {
-;CHECK: bic {{v[0-9]+}}.8h, #0xff, lsl #8
+; CHECK-LABEL: and64imm8h_lsl8:
+; CHECK: bic {{v[0-9]+}}.8h, #{{0xff|255}}, lsl #8
%tmp1 = and <2 x i64> %a, < i64 71777214294589695, i64 71777214294589695>
ret <2 x i64> %tmp1
}
define <8 x i8> @orr8imm2s_lsl0(<8 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff
+; CHECK-LABEL: orr8imm2s_lsl0:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}
%tmp1 = or <8 x i8> %a, < i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0>
ret <8 x i8> %tmp1
}
define <8 x i8> @orr8imm2s_lsl8(<8 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #8
+; CHECK-LABEL: orr8imm2s_lsl8:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #8
%tmp1 = or <8 x i8> %a, < i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0>
ret <8 x i8> %tmp1
}
define <8 x i8> @orr8imm2s_lsl16(<8 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #16
+; CHECK-LABEL: orr8imm2s_lsl16:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #16
%tmp1 = or <8 x i8> %a, < i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0>
ret <8 x i8> %tmp1
}
define <8 x i8> @orr8imm2s_lsl24(<8 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #24
+; CHECK-LABEL: orr8imm2s_lsl24:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #24
%tmp1 = or <8 x i8> %a, < i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255>
ret <8 x i8> %tmp1
}
define <4 x i16> @orr16imm2s_lsl0(<4 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff
+; CHECK-LABEL: orr16imm2s_lsl0:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}
%tmp1 = or <4 x i16> %a, < i16 255, i16 0, i16 255, i16 0>
ret <4 x i16> %tmp1
}
define <4 x i16> @orr16imm2s_lsl8(<4 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #8
+; CHECK-LABEL: orr16imm2s_lsl8:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #8
%tmp1 = or <4 x i16> %a, < i16 65280, i16 0, i16 65280, i16 0>
ret <4 x i16> %tmp1
}
define <4 x i16> @orr16imm2s_lsl16(<4 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #16
+; CHECK-LABEL: orr16imm2s_lsl16:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #16
%tmp1 = or <4 x i16> %a, < i16 0, i16 255, i16 0, i16 255>
ret <4 x i16> %tmp1
}
define <4 x i16> @orr16imm2s_lsl24(<4 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #24
+; CHECK-LABEL: orr16imm2s_lsl24:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #24
%tmp1 = or <4 x i16> %a, < i16 0, i16 65280, i16 0, i16 65280>
ret <4 x i16> %tmp1
}
define <1 x i64> @orr64imm2s_lsl0(<1 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff
+; CHECK-LABEL: orr64imm2s_lsl0:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}
%tmp1 = or <1 x i64> %a, < i64 1095216660735>
ret <1 x i64> %tmp1
}
define <1 x i64> @orr64imm2s_lsl8(<1 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #8
+; CHECK-LABEL: orr64imm2s_lsl8:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #8
%tmp1 = or <1 x i64> %a, < i64 280375465148160>
ret <1 x i64> %tmp1
}
define <1 x i64> @orr64imm2s_lsl16(<1 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #16
+; CHECK-LABEL: orr64imm2s_lsl16:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #16
%tmp1 = or <1 x i64> %a, < i64 71776119077928960>
ret <1 x i64> %tmp1
}
define <1 x i64> @orr64imm2s_lsl24(<1 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.2s, #0xff, lsl #24
+; CHECK-LABEL: orr64imm2s_lsl24:
+; CHECK: orr {{v[0-9]+}}.2s, #{{0xff|255}}, lsl #24
%tmp1 = or <1 x i64> %a, < i64 -72057589759737856>
ret <1 x i64> %tmp1
}
define <16 x i8> @orr8imm4s_lsl0(<16 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff
+; CHECK-LABEL: orr8imm4s_lsl0:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}
%tmp1 = or <16 x i8> %a, < i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0>
ret <16 x i8> %tmp1
}
define <16 x i8> @orr8imm4s_lsl8(<16 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #8
+; CHECK-LABEL: orr8imm4s_lsl8:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #8
%tmp1 = or <16 x i8> %a, < i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0>
ret <16 x i8> %tmp1
}
define <16 x i8> @orr8imm4s_lsl16(<16 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #16
+; CHECK-LABEL: orr8imm4s_lsl16:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #16
%tmp1 = or <16 x i8> %a, < i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0>
ret <16 x i8> %tmp1
}
define <16 x i8> @orr8imm4s_lsl24(<16 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #24
+; CHECK-LABEL: orr8imm4s_lsl24:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #24
%tmp1 = or <16 x i8> %a, < i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255, i8 0, i8 0, i8 0, i8 255>
ret <16 x i8> %tmp1
}
define <8 x i16> @orr16imm4s_lsl0(<8 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff
+; CHECK-LABEL: orr16imm4s_lsl0:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}
%tmp1 = or <8 x i16> %a, < i16 255, i16 0, i16 255, i16 0, i16 255, i16 0, i16 255, i16 0>
ret <8 x i16> %tmp1
}
define <8 x i16> @orr16imm4s_lsl8(<8 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #8
+; CHECK-LABEL: orr16imm4s_lsl8:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #8
%tmp1 = or <8 x i16> %a, < i16 65280, i16 0, i16 65280, i16 0, i16 65280, i16 0, i16 65280, i16 0>
ret <8 x i16> %tmp1
}
define <8 x i16> @orr16imm4s_lsl16(<8 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #16
+; CHECK-LABEL: orr16imm4s_lsl16:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #16
%tmp1 = or <8 x i16> %a, < i16 0, i16 255, i16 0, i16 255, i16 0, i16 255, i16 0, i16 255>
ret <8 x i16> %tmp1
}
define <8 x i16> @orr16imm4s_lsl24(<8 x i16> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #24
+; CHECK-LABEL: orr16imm4s_lsl24:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #24
%tmp1 = or <8 x i16> %a, < i16 0, i16 65280, i16 0, i16 65280, i16 0, i16 65280, i16 0, i16 65280>
ret <8 x i16> %tmp1
}
define <2 x i64> @orr64imm4s_lsl0(<2 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff
+; CHECK-LABEL: orr64imm4s_lsl0:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}
%tmp1 = or <2 x i64> %a, < i64 1095216660735, i64 1095216660735>
ret <2 x i64> %tmp1
}
define <2 x i64> @orr64imm4s_lsl8(<2 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #8
+; CHECK-LABEL: orr64imm4s_lsl8:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #8
%tmp1 = or <2 x i64> %a, < i64 280375465148160, i64 280375465148160>
ret <2 x i64> %tmp1
}
define <2 x i64> @orr64imm4s_lsl16(<2 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #16
+; CHECK-LABEL: orr64imm4s_lsl16:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #16
%tmp1 = or <2 x i64> %a, < i64 71776119077928960, i64 71776119077928960>
ret <2 x i64> %tmp1
}
define <2 x i64> @orr64imm4s_lsl24(<2 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.4s, #0xff, lsl #24
+; CHECK-LABEL: orr64imm4s_lsl24:
+; CHECK: orr {{v[0-9]+}}.4s, #{{0xff|255}}, lsl #24
%tmp1 = or <2 x i64> %a, < i64 -72057589759737856, i64 -72057589759737856>
ret <2 x i64> %tmp1
}
define <8 x i8> @orr8imm4h_lsl0(<8 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.4h, #0xff
+; CHECK-LABEL: orr8imm4h_lsl0:
+; CHECK: orr {{v[0-9]+}}.4h, #{{0xff|255}}
%tmp1 = or <8 x i8> %a, < i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0>
ret <8 x i8> %tmp1
}
define <8 x i8> @orr8imm4h_lsl8(<8 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.4h, #0xff, lsl #8
+; CHECK-LABEL: orr8imm4h_lsl8:
+; CHECK: orr {{v[0-9]+}}.4h, #{{0xff|255}}, lsl #8
%tmp1 = or <8 x i8> %a, < i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
ret <8 x i8> %tmp1
}
define <2 x i32> @orr16imm4h_lsl0(<2 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.4h, #0xff
+; CHECK-LABEL: orr16imm4h_lsl0:
+; CHECK: orr {{v[0-9]+}}.4h, #{{0xff|255}}
%tmp1 = or <2 x i32> %a, < i32 16711935, i32 16711935>
ret <2 x i32> %tmp1
}
define <2 x i32> @orr16imm4h_lsl8(<2 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.4h, #0xff, lsl #8
+; CHECK-LABEL: orr16imm4h_lsl8:
+; CHECK: orr {{v[0-9]+}}.4h, #{{0xff|255}}, lsl #8
%tmp1 = or <2 x i32> %a, < i32 4278255360, i32 4278255360>
ret <2 x i32> %tmp1
}
define <1 x i64> @orr64imm4h_lsl0(<1 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.4h, #0xff
+; CHECK-LABEL: orr64imm4h_lsl0:
+; CHECK: orr {{v[0-9]+}}.4h, #{{0xff|255}}
%tmp1 = or <1 x i64> %a, < i64 71777214294589695>
ret <1 x i64> %tmp1
}
define <1 x i64> @orr64imm4h_lsl8(<1 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.4h, #0xff, lsl #8
+; CHECK-LABEL: orr64imm4h_lsl8:
+; CHECK: orr {{v[0-9]+}}.4h, #{{0xff|255}}, lsl #8
%tmp1 = or <1 x i64> %a, < i64 -71777214294589696>
ret <1 x i64> %tmp1
}
define <16 x i8> @orr8imm8h_lsl0(<16 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.8h, #0xff
+; CHECK-LABEL: orr8imm8h_lsl0:
+; CHECK: orr {{v[0-9]+}}.8h, #{{0xff|255}}
%tmp1 = or <16 x i8> %a, < i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0>
ret <16 x i8> %tmp1
}
define <16 x i8> @orr8imm8h_lsl8(<16 x i8> %a) {
-;CHECK: orr {{v[0-9]+}}.8h, #0xff, lsl #8
+; CHECK-LABEL: orr8imm8h_lsl8:
+; CHECK: orr {{v[0-9]+}}.8h, #{{0xff|255}}, lsl #8
%tmp1 = or <16 x i8> %a, < i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
ret <16 x i8> %tmp1
}
define <4 x i32> @orr16imm8h_lsl0(<4 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.8h, #0xff
+; CHECK-LABEL: orr16imm8h_lsl0:
+; CHECK: orr {{v[0-9]+}}.8h, #{{0xff|255}}
%tmp1 = or <4 x i32> %a, < i32 16711935, i32 16711935, i32 16711935, i32 16711935>
ret <4 x i32> %tmp1
}
define <4 x i32> @orr16imm8h_lsl8(<4 x i32> %a) {
-;CHECK: orr {{v[0-9]+}}.8h, #0xff, lsl #8
+; CHECK-LABEL: orr16imm8h_lsl8:
+; CHECK: orr {{v[0-9]+}}.8h, #{{0xff|255}}, lsl #8
%tmp1 = or <4 x i32> %a, < i32 4278255360, i32 4278255360, i32 4278255360, i32 4278255360>
ret <4 x i32> %tmp1
}
define <2 x i64> @orr64imm8h_lsl0(<2 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.8h, #0xff
+; CHECK-LABEL: orr64imm8h_lsl0:
+; CHECK: orr {{v[0-9]+}}.8h, #{{0xff|255}}
%tmp1 = or <2 x i64> %a, < i64 71777214294589695, i64 71777214294589695>
ret <2 x i64> %tmp1
}
define <2 x i64> @orr64imm8h_lsl8(<2 x i64> %a) {
-;CHECK: orr {{v[0-9]+}}.8h, #0xff, lsl #8
+; CHECK-LABEL: orr64imm8h_lsl8:
+; CHECK: orr {{v[0-9]+}}.8h, #{{0xff|255}}, lsl #8
%tmp1 = or <2 x i64> %a, < i64 -71777214294589696, i64 -71777214294589696>
ret <2 x i64> %tmp1
}
diff --git a/test/CodeGen/AArch64/neon-bsl.ll b/test/CodeGen/AArch64/neon-bsl.ll
deleted file mode 100644
index c55fd01..0000000
--- a/test/CodeGen/AArch64/neon-bsl.ll
+++ /dev/null
@@ -1,235 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-declare <2 x double> @llvm.arm.neon.vbsl.v2f64(<2 x double>, <2 x double>, <2 x double>)
-
-declare <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
-
-declare <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
-
-declare <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float>, <4 x float>, <4 x float>)
-
-declare <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
-
-declare <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
-
-declare <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
-
-declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>)
-
-declare <1 x double> @llvm.arm.neon.vbsl.v1f64(<1 x double>, <1 x double>, <1 x double>)
-
-declare <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float>, <2 x float>, <2 x float>)
-
-declare <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64>, <1 x i64>, <1 x i64>)
-
-declare <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
-
-define <8 x i8> @test_vbsl_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
-; CHECK-LABEL: test_vbsl_s8:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3)
- ret <8 x i8> %vbsl.i
-}
-
-define <8 x i8> @test_vbsl_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) {
-; CHECK-LABEL: test_vbsl_s16:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3)
- %0 = bitcast <4 x i16> %vbsl3.i to <8 x i8>
- ret <8 x i8> %0
-}
-
-define <2 x i32> @test_vbsl_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
-; CHECK-LABEL: test_vbsl_s32:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3)
- ret <2 x i32> %vbsl3.i
-}
-
-define <1 x i64> @test_vbsl_s64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
-; CHECK-LABEL: test_vbsl_s64:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3)
- ret <1 x i64> %vbsl3.i
-}
-
-define <8 x i8> @test_vbsl_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
-; CHECK-LABEL: test_vbsl_u8:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3)
- ret <8 x i8> %vbsl.i
-}
-
-define <4 x i16> @test_vbsl_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) {
-; CHECK-LABEL: test_vbsl_u16:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3)
- ret <4 x i16> %vbsl3.i
-}
-
-define <2 x i32> @test_vbsl_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
-; CHECK-LABEL: test_vbsl_u32:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3)
- ret <2 x i32> %vbsl3.i
-}
-
-define <1 x i64> @test_vbsl_u64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
-; CHECK-LABEL: test_vbsl_u64:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3)
- ret <1 x i64> %vbsl3.i
-}
-
-define <2 x float> @test_vbsl_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) {
-; CHECK-LABEL: test_vbsl_f32:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl3.i = tail call <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3)
- ret <2 x float> %vbsl3.i
-}
-
-define <1 x double> @test_vbsl_f64(<1 x i64> %v1, <1 x double> %v2, <1 x double> %v3) {
-; CHECK-LABEL: test_vbsl_f64:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl.i = bitcast <1 x i64> %v1 to <1 x double>
- %vbsl3.i = tail call <1 x double> @llvm.arm.neon.vbsl.v1f64(<1 x double> %vbsl.i, <1 x double> %v2, <1 x double> %v3)
- ret <1 x double> %vbsl3.i
-}
-
-define <8 x i8> @test_vbsl_p8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
-; CHECK-LABEL: test_vbsl_p8:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3)
- ret <8 x i8> %vbsl.i
-}
-
-define <4 x i16> @test_vbsl_p16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) {
-; CHECK-LABEL: test_vbsl_p16:
-; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-entry:
- %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3)
- ret <4 x i16> %vbsl3.i
-}
-
-define <16 x i8> @test_vbslq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
-; CHECK-LABEL: test_vbslq_s8:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3)
- ret <16 x i8> %vbsl.i
-}
-
-define <8 x i16> @test_vbslq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) {
-; CHECK-LABEL: test_vbslq_s16:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3)
- ret <8 x i16> %vbsl3.i
-}
-
-define <4 x i32> @test_vbslq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
-; CHECK-LABEL: test_vbslq_s32:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3)
- ret <4 x i32> %vbsl3.i
-}
-
-define <2 x i64> @test_vbslq_s64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) {
-; CHECK-LABEL: test_vbslq_s64:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3)
- ret <2 x i64> %vbsl3.i
-}
-
-define <16 x i8> @test_vbslq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
-; CHECK-LABEL: test_vbslq_u8:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3)
- ret <16 x i8> %vbsl.i
-}
-
-define <8 x i16> @test_vbslq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) {
-; CHECK-LABEL: test_vbslq_u16:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3)
- ret <8 x i16> %vbsl3.i
-}
-
-define <4 x i32> @test_vbslq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
-; CHECK-LABEL: test_vbslq_u32:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3)
- ret <4 x i32> %vbsl3.i
-}
-
-define <2 x i64> @test_vbslq_u64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) {
-; CHECK-LABEL: test_vbslq_u64:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3)
- ret <2 x i64> %vbsl3.i
-}
-
-define <4 x float> @test_vbslq_f32(<4 x i32> %v1, <4 x float> %v2, <4 x float> %v3) {
-; CHECK-LABEL: test_vbslq_f32:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl.i = bitcast <4 x i32> %v1 to <4 x float>
- %vbsl3.i = tail call <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float> %vbsl.i, <4 x float> %v2, <4 x float> %v3)
- ret <4 x float> %vbsl3.i
-}
-
-define <16 x i8> @test_vbslq_p8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
-; CHECK-LABEL: test_vbslq_p8:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3)
- ret <16 x i8> %vbsl.i
-}
-
-define <8 x i16> @test_vbslq_p16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) {
-; CHECK-LABEL: test_vbslq_p16:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3)
- ret <8 x i16> %vbsl3.i
-}
-
-define <2 x double> @test_vbslq_f64(<2 x i64> %v1, <2 x double> %v2, <2 x double> %v3) {
-; CHECK-LABEL: test_vbslq_f64:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %vbsl.i = bitcast <2 x i64> %v1 to <2 x double>
- %vbsl3.i = tail call <2 x double> @llvm.arm.neon.vbsl.v2f64(<2 x double> %vbsl.i, <2 x double> %v2, <2 x double> %v3)
- ret <2 x double> %vbsl3.i
-}
-
-define <2 x double> @test_bsl_v2f64(<2 x i1> %v1, <2 x double> %v2, <2 x double> %v3) {
-; CHECK-LABEL: test_bsl_v2f64:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
- %1 = select <2 x i1> %v1, <2 x double> %v2, <2 x double> %v3
- ret <2 x double> %1
-}
-
-define <4 x float> @test_bsl_v4f32(<4 x i1> %v1, <4 x float> %v2, <4 x float> %v3) {
-; CHECK-LABEL: test_bsl_v4f32:
-; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
- %1 = select <4 x i1> %v1, <4 x float> %v2, <4 x float> %v3
- ret <4 x float> %1
-}
diff --git a/test/CodeGen/AArch64/neon-compare-instructions.ll b/test/CodeGen/AArch64/neon-compare-instructions.ll
index 68f0342..6d89dfb 100644
--- a/test/CodeGen/AArch64/neon-compare-instructions.ll
+++ b/test/CodeGen/AArch64/neon-compare-instructions.ll
@@ -1,560 +1,631 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
define <8 x i8> @cmeq8xi8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmeq8xi8:
+; CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp eq <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmeq16xi8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmeq16xi8:
+; CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp eq <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmeq4xi16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-LABEL: cmeq4xi16:
+; CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%tmp3 = icmp eq <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmeq8xi16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK-LABEL: cmeq8xi16:
+; CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp3 = icmp eq <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmeq2xi32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: cmeq2xi32:
+; CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = icmp eq <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmeq4xi32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: cmeq4xi32:
+; CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = icmp eq <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmeq2xi64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: cmeq2xi64:
+; CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = icmp eq <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmne8xi8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmne8xi8:
+; CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmne16xi8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmne16xi8:
+; CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmne4xi16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmne4xi16:
+; CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmne8xi16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmne8xi16:
+; CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmne2xi32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmne2xi32:
+; CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmne4xi32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmne4xi32:
+; CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmne2xi64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmne2xi64:
+; CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmgt8xi8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmgt8xi8:
+; CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp sgt <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmgt16xi8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmgt16xi8:
+; CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp sgt <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmgt4xi16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-LABEL: cmgt4xi16:
+; CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%tmp3 = icmp sgt <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmgt8xi16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK-LABEL: cmgt8xi16:
+; CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp3 = icmp sgt <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmgt2xi32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: cmgt2xi32:
+; CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = icmp sgt <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmgt4xi32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: cmgt4xi32:
+; CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = icmp sgt <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmgt2xi64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: cmgt2xi64:
+; CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = icmp sgt <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmlt8xi8(<8 x i8> %A, <8 x i8> %B) {
+; CHECK-LABEL: cmlt8xi8:
; Using registers other than v0, v1 is possible, but would be odd.
; LT implemented as GT, so check reversed operands.
-;CHECK: cmgt {{v[0-9]+}}.8b, v1.8b, v0.8b
+; CHECK: cmgt {{v[0-9]+}}.8b, v1.8b, v0.8b
%tmp3 = icmp slt <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmlt16xi8(<16 x i8> %A, <16 x i8> %B) {
+; CHECK-LABEL: cmlt16xi8:
; Using registers other than v0, v1 is possible, but would be odd.
; LT implemented as GT, so check reversed operands.
-;CHECK: cmgt {{v[0-9]+}}.16b, v1.16b, v0.16b
+; CHECK: cmgt {{v[0-9]+}}.16b, v1.16b, v0.16b
%tmp3 = icmp slt <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmlt4xi16(<4 x i16> %A, <4 x i16> %B) {
+; CHECK-LABEL: cmlt4xi16:
; Using registers other than v0, v1 is possible, but would be odd.
; LT implemented as GT, so check reversed operands.
-;CHECK: cmgt {{v[0-9]+}}.4h, v1.4h, v0.4h
+; CHECK: cmgt {{v[0-9]+}}.4h, v1.4h, v0.4h
%tmp3 = icmp slt <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmlt8xi16(<8 x i16> %A, <8 x i16> %B) {
+; CHECK-LABEL: cmlt8xi16:
; Using registers other than v0, v1 is possible, but would be odd.
; LT implemented as GT, so check reversed operands.
-;CHECK: cmgt {{v[0-9]+}}.8h, v1.8h, v0.8h
+; CHECK: cmgt {{v[0-9]+}}.8h, v1.8h, v0.8h
%tmp3 = icmp slt <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmlt2xi32(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: cmlt2xi32:
; Using registers other than v0, v1 is possible, but would be odd.
; LT implemented as GT, so check reversed operands.
-;CHECK: cmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK: cmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
%tmp3 = icmp slt <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmlt4xi32(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: cmlt4xi32:
; Using registers other than v0, v1 is possible, but would be odd.
; LT implemented as GT, so check reversed operands.
-;CHECK: cmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK: cmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
%tmp3 = icmp slt <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmlt2xi64(<2 x i64> %A, <2 x i64> %B) {
+; CHECK-LABEL: cmlt2xi64:
; Using registers other than v0, v1 is possible, but would be odd.
; LT implemented as GT, so check reversed operands.
-;CHECK: cmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK: cmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
%tmp3 = icmp slt <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmge8xi8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmge8xi8:
+; CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp sge <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmge16xi8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmge16xi8:
+; CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp sge <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmge4xi16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-LABEL: cmge4xi16:
+; CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%tmp3 = icmp sge <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmge8xi16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK-LABEL: cmge8xi16:
+; CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp3 = icmp sge <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmge2xi32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: cmge2xi32:
+; CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = icmp sge <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmge4xi32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: cmge4xi32:
+; CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = icmp sge <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmge2xi64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: cmge2xi64:
+; CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = icmp sge <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmle8xi8(<8 x i8> %A, <8 x i8> %B) {
+; CHECK-LABEL: cmle8xi8:
; Using registers other than v0, v1 is possible, but would be odd.
; LE implemented as GE, so check reversed operands.
-;CHECK: cmge {{v[0-9]+}}.8b, v1.8b, v0.8b
+; CHECK: cmge {{v[0-9]+}}.8b, v1.8b, v0.8b
%tmp3 = icmp sle <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmle16xi8(<16 x i8> %A, <16 x i8> %B) {
+; CHECK-LABEL: cmle16xi8:
; Using registers other than v0, v1 is possible, but would be odd.
; LE implemented as GE, so check reversed operands.
-;CHECK: cmge {{v[0-9]+}}.16b, v1.16b, v0.16b
+; CHECK: cmge {{v[0-9]+}}.16b, v1.16b, v0.16b
%tmp3 = icmp sle <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmle4xi16(<4 x i16> %A, <4 x i16> %B) {
+; CHECK-LABEL: cmle4xi16:
; Using registers other than v0, v1 is possible, but would be odd.
; LE implemented as GE, so check reversed operands.
-;CHECK: cmge {{v[0-9]+}}.4h, v1.4h, v0.4h
+; CHECK: cmge {{v[0-9]+}}.4h, v1.4h, v0.4h
%tmp3 = icmp sle <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmle8xi16(<8 x i16> %A, <8 x i16> %B) {
+; CHECK-LABEL: cmle8xi16:
; Using registers other than v0, v1 is possible, but would be odd.
; LE implemented as GE, so check reversed operands.
-;CHECK: cmge {{v[0-9]+}}.8h, v1.8h, v0.8h
+; CHECK: cmge {{v[0-9]+}}.8h, v1.8h, v0.8h
%tmp3 = icmp sle <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmle2xi32(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: cmle2xi32:
; Using registers other than v0, v1 is possible, but would be odd.
; LE implemented as GE, so check reversed operands.
-;CHECK: cmge {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK: cmge {{v[0-9]+}}.2s, v1.2s, v0.2s
%tmp3 = icmp sle <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmle4xi32(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: cmle4xi32:
; Using registers other than v0, v1 is possible, but would be odd.
; LE implemented as GE, so check reversed operands.
-;CHECK: cmge {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK: cmge {{v[0-9]+}}.4s, v1.4s, v0.4s
%tmp3 = icmp sle <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmle2xi64(<2 x i64> %A, <2 x i64> %B) {
+; CHECK-LABEL: cmle2xi64:
; Using registers other than v0, v1 is possible, but would be odd.
; LE implemented as GE, so check reversed operands.
-;CHECK: cmge {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK: cmge {{v[0-9]+}}.2d, v1.2d, v0.2d
%tmp3 = icmp sle <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmhi8xi8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmhi8xi8:
+; CHECK: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ugt <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmhi16xi8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmhi16xi8:
+; CHECK: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ugt <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmhi4xi16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-LABEL: cmhi4xi16:
+; CHECK: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%tmp3 = icmp ugt <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmhi8xi16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK-LABEL: cmhi8xi16:
+; CHECK: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp3 = icmp ugt <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmhi2xi32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: cmhi2xi32:
+; CHECK: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = icmp ugt <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmhi4xi32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: cmhi4xi32:
+; CHECK: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = icmp ugt <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmhi2xi64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: cmhi2xi64:
+; CHECK: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = icmp ugt <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmlo8xi8(<8 x i8> %A, <8 x i8> %B) {
+; CHECK-LABEL: cmlo8xi8:
; Using registers other than v0, v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: cmhi {{v[0-9]+}}.8b, v1.8b, v0.8b
+; CHECK: cmhi {{v[0-9]+}}.8b, v1.8b, v0.8b
%tmp3 = icmp ult <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmlo16xi8(<16 x i8> %A, <16 x i8> %B) {
+; CHECK-LABEL: cmlo16xi8:
; Using registers other than v0, v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b
+; CHECK: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b
%tmp3 = icmp ult <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmlo4xi16(<4 x i16> %A, <4 x i16> %B) {
+; CHECK-LABEL: cmlo4xi16:
; Using registers other than v0, v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h
+; CHECK: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h
%tmp3 = icmp ult <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmlo8xi16(<8 x i16> %A, <8 x i16> %B) {
+; CHECK-LABEL: cmlo8xi16:
; Using registers other than v0, v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h
+; CHECK: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h
%tmp3 = icmp ult <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmlo2xi32(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: cmlo2xi32:
; Using registers other than v0, v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s
%tmp3 = icmp ult <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmlo4xi32(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: cmlo4xi32:
; Using registers other than v0, v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s
%tmp3 = icmp ult <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmlo2xi64(<2 x i64> %A, <2 x i64> %B) {
+; CHECK-LABEL: cmlo2xi64:
; Using registers other than v0, v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d
%tmp3 = icmp ult <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmhs8xi8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmhs8xi8:
+; CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp uge <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmhs16xi8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmhs16xi8:
+; CHECK: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp uge <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmhs4xi16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-LABEL: cmhs4xi16:
+; CHECK: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%tmp3 = icmp uge <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmhs8xi16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK-LABEL: cmhs8xi16:
+; CHECK: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp3 = icmp uge <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmhs2xi32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: cmhs2xi32:
+; CHECK: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = icmp uge <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmhs4xi32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: cmhs4xi32:
+; CHECK: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = icmp uge <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmhs2xi64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: cmhs2xi64:
+; CHECK: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = icmp uge <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmls8xi8(<8 x i8> %A, <8 x i8> %B) {
+; CHECK-LABEL: cmls8xi8:
; Using registers other than v0, v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
+; CHECK: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
%tmp3 = icmp ule <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmls16xi8(<16 x i8> %A, <16 x i8> %B) {
+; CHECK-LABEL: cmls16xi8:
; Using registers other than v0, v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b
+; CHECK: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b
%tmp3 = icmp ule <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmls4xi16(<4 x i16> %A, <4 x i16> %B) {
+; CHECK-LABEL: cmls4xi16:
; Using registers other than v0, v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
+; CHECK: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
%tmp3 = icmp ule <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmls8xi16(<8 x i16> %A, <8 x i16> %B) {
+; CHECK-LABEL: cmls8xi16:
; Using registers other than v0, v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h
+; CHECK: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h
%tmp3 = icmp ule <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmls2xi32(<2 x i32> %A, <2 x i32> %B) {
+; CHECK-LABEL: cmls2xi32:
; Using registers other than v0, v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
%tmp3 = icmp ule <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmls4xi32(<4 x i32> %A, <4 x i32> %B) {
+; CHECK-LABEL: cmls4xi32:
; Using registers other than v0, v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s
%tmp3 = icmp ule <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmls2xi64(<2 x i64> %A, <2 x i64> %B) {
+; CHECK-LABEL: cmls2xi64:
; Using registers other than v0, v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d
%tmp3 = icmp ule <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmtst8xi8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmtst8xi8:
+; CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = and <8 x i8> %A, %B
%tmp4 = icmp ne <8 x i8> %tmp3, zeroinitializer
%tmp5 = sext <8 x i1> %tmp4 to <8 x i8>
@@ -562,7 +633,8 @@ define <8 x i8> @cmtst8xi8(<8 x i8> %A, <8 x i8> %B) {
}
define <16 x i8> @cmtst16xi8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmtst16xi8:
+; CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = and <16 x i8> %A, %B
%tmp4 = icmp ne <16 x i8> %tmp3, zeroinitializer
%tmp5 = sext <16 x i1> %tmp4 to <16 x i8>
@@ -570,7 +642,8 @@ define <16 x i8> @cmtst16xi8(<16 x i8> %A, <16 x i8> %B) {
}
define <4 x i16> @cmtst4xi16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-LABEL: cmtst4xi16:
+; CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%tmp3 = and <4 x i16> %A, %B
%tmp4 = icmp ne <4 x i16> %tmp3, zeroinitializer
%tmp5 = sext <4 x i1> %tmp4 to <4 x i16>
@@ -578,7 +651,8 @@ define <4 x i16> @cmtst4xi16(<4 x i16> %A, <4 x i16> %B) {
}
define <8 x i16> @cmtst8xi16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK-LABEL: cmtst8xi16:
+; CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp3 = and <8 x i16> %A, %B
%tmp4 = icmp ne <8 x i16> %tmp3, zeroinitializer
%tmp5 = sext <8 x i1> %tmp4 to <8 x i16>
@@ -586,7 +660,8 @@ define <8 x i16> @cmtst8xi16(<8 x i16> %A, <8 x i16> %B) {
}
define <2 x i32> @cmtst2xi32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: cmtst {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: cmtst2xi32:
+; CHECK: cmtst {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = and <2 x i32> %A, %B
%tmp4 = icmp ne <2 x i32> %tmp3, zeroinitializer
%tmp5 = sext <2 x i1> %tmp4 to <2 x i32>
@@ -594,7 +669,8 @@ define <2 x i32> @cmtst2xi32(<2 x i32> %A, <2 x i32> %B) {
}
define <4 x i32> @cmtst4xi32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: cmtst {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: cmtst4xi32:
+; CHECK: cmtst {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = and <4 x i32> %A, %B
%tmp4 = icmp ne <4 x i32> %tmp3, zeroinitializer
%tmp5 = sext <4 x i1> %tmp4 to <4 x i32>
@@ -602,7 +678,8 @@ define <4 x i32> @cmtst4xi32(<4 x i32> %A, <4 x i32> %B) {
}
define <2 x i64> @cmtst2xi64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: cmtst2xi64:
+; CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = and <2 x i64> %A, %B
%tmp4 = icmp ne <2 x i64> %tmp3, zeroinitializer
%tmp5 = sext <2 x i1> %tmp4 to <2 x i64>
@@ -612,49 +689,56 @@ define <2 x i64> @cmtst2xi64(<2 x i64> %A, <2 x i64> %B) {
define <8 x i8> @cmeqz8xi8(<8 x i8> %A) {
-;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+; CHECK-LABEL: cmeqz8xi8:
+; CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
%tmp3 = icmp eq <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmeqz16xi8(<16 x i8> %A) {
-;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+; CHECK-LABEL: cmeqz16xi8:
+; CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
%tmp3 = icmp eq <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmeqz4xi16(<4 x i16> %A) {
-;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+; CHECK-LABEL: cmeqz4xi16:
+; CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
%tmp3 = icmp eq <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmeqz8xi16(<8 x i16> %A) {
-;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+; CHECK-LABEL: cmeqz8xi16:
+; CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
%tmp3 = icmp eq <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmeqz2xi32(<2 x i32> %A) {
-;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+; CHECK-LABEL: cmeqz2xi32:
+; CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
%tmp3 = icmp eq <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmeqz4xi32(<4 x i32> %A) {
-;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+; CHECK-LABEL: cmeqz4xi32:
+; CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
%tmp3 = icmp eq <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmeqz2xi64(<2 x i64> %A) {
-;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+; CHECK-LABEL: cmeqz2xi64:
+; CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
%tmp3 = icmp eq <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -662,49 +746,56 @@ define <2 x i64> @cmeqz2xi64(<2 x i64> %A) {
define <8 x i8> @cmgez8xi8(<8 x i8> %A) {
-;CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+; CHECK-LABEL: cmgez8xi8:
+; CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
%tmp3 = icmp sge <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmgez16xi8(<16 x i8> %A) {
-;CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+; CHECK-LABEL: cmgez16xi8:
+; CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
%tmp3 = icmp sge <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmgez4xi16(<4 x i16> %A) {
-;CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+; CHECK-LABEL: cmgez4xi16:
+; CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
%tmp3 = icmp sge <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmgez8xi16(<8 x i16> %A) {
-;CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+; CHECK-LABEL: cmgez8xi16:
+; CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
%tmp3 = icmp sge <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmgez2xi32(<2 x i32> %A) {
-;CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+; CHECK-LABEL: cmgez2xi32:
+; CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
%tmp3 = icmp sge <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmgez4xi32(<4 x i32> %A) {
-;CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+; CHECK-LABEL: cmgez4xi32:
+; CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
%tmp3 = icmp sge <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmgez2xi64(<2 x i64> %A) {
-;CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+; CHECK-LABEL: cmgez2xi64:
+; CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
%tmp3 = icmp sge <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -712,259 +803,294 @@ define <2 x i64> @cmgez2xi64(<2 x i64> %A) {
define <8 x i8> @cmgtz8xi8(<8 x i8> %A) {
-;CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+; CHECK-LABEL: cmgtz8xi8:
+; CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
%tmp3 = icmp sgt <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmgtz16xi8(<16 x i8> %A) {
-;CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+; CHECK-LABEL: cmgtz16xi8:
+; CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
%tmp3 = icmp sgt <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmgtz4xi16(<4 x i16> %A) {
-;CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+; CHECK-LABEL: cmgtz4xi16:
+; CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
%tmp3 = icmp sgt <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmgtz8xi16(<8 x i16> %A) {
-;CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+; CHECK-LABEL: cmgtz8xi16:
+; CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
%tmp3 = icmp sgt <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmgtz2xi32(<2 x i32> %A) {
-;CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+; CHECK-LABEL: cmgtz2xi32:
+; CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
%tmp3 = icmp sgt <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmgtz4xi32(<4 x i32> %A) {
-;CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+; CHECK-LABEL: cmgtz4xi32:
+; CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
%tmp3 = icmp sgt <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmgtz2xi64(<2 x i64> %A) {
-;CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+; CHECK-LABEL: cmgtz2xi64:
+; CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
%tmp3 = icmp sgt <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmlez8xi8(<8 x i8> %A) {
-;CHECK: cmle {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+; CHECK-LABEL: cmlez8xi8:
+; CHECK: cmle {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
%tmp3 = icmp sle <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmlez16xi8(<16 x i8> %A) {
-;CHECK: cmle {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+; CHECK-LABEL: cmlez16xi8:
+; CHECK: cmle {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
%tmp3 = icmp sle <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmlez4xi16(<4 x i16> %A) {
-;CHECK: cmle {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+; CHECK-LABEL: cmlez4xi16:
+; CHECK: cmle {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
%tmp3 = icmp sle <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmlez8xi16(<8 x i16> %A) {
-;CHECK: cmle {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+; CHECK-LABEL: cmlez8xi16:
+; CHECK: cmle {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
%tmp3 = icmp sle <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmlez2xi32(<2 x i32> %A) {
-;CHECK: cmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+; CHECK-LABEL: cmlez2xi32:
+; CHECK: cmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
%tmp3 = icmp sle <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmlez4xi32(<4 x i32> %A) {
-;CHECK: cmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+; CHECK-LABEL: cmlez4xi32:
+; CHECK: cmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
%tmp3 = icmp sle <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmlez2xi64(<2 x i64> %A) {
-;CHECK: cmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+; CHECK-LABEL: cmlez2xi64:
+; CHECK: cmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
%tmp3 = icmp sle <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmltz8xi8(<8 x i8> %A) {
-;CHECK: cmlt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+; CHECK-LABEL: cmltz8xi8:
+; CHECK: cmlt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
%tmp3 = icmp slt <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmltz16xi8(<16 x i8> %A) {
-;CHECK: cmlt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+; CHECK-LABEL: cmltz16xi8:
+; CHECK: cmlt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
%tmp3 = icmp slt <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmltz4xi16(<4 x i16> %A) {
-;CHECK: cmlt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+; CHECK-LABEL: cmltz4xi16:
+; CHECK: cmlt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
%tmp3 = icmp slt <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmltz8xi16(<8 x i16> %A) {
-;CHECK: cmlt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+; CHECK-LABEL: cmltz8xi16:
+; CHECK: cmlt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
%tmp3 = icmp slt <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmltz2xi32(<2 x i32> %A) {
-;CHECK: cmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+; CHECK-LABEL: cmltz2xi32:
+; CHECK: cmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
%tmp3 = icmp slt <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmltz4xi32(<4 x i32> %A) {
-;CHECK: cmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+; CHECK-LABEL: cmltz4xi32:
+; CHECK: cmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
%tmp3 = icmp slt <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmltz2xi64(<2 x i64> %A) {
-;CHECK: cmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+; CHECK-LABEL: cmltz2xi64:
+; CHECK: cmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
%tmp3 = icmp slt <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmneqz8xi8(<8 x i8> %A) {
-;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmneqz8xi8:
+; CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmneqz16xi8(<16 x i8> %A) {
-;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmneqz16xi8:
+; CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmneqz4xi16(<4 x i16> %A) {
-;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmneqz4xi16:
+; CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmneqz8xi16(<8 x i16> %A) {
-;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmneqz8xi16:
+; CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmneqz2xi32(<2 x i32> %A) {
-;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmneqz2xi32:
+; CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmneqz4xi32(<4 x i32> %A) {
-;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmneqz4xi32:
+; CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmneqz2xi64(<2 x i64> %A) {
-;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmneqz2xi64:
+; CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmhsz8xi8(<8 x i8> %A) {
-;CHECK: movi {{v[0-9]+}}.8b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmhsz8xi8:
+; CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp uge <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmhsz16xi8(<16 x i8> %A) {
-;CHECK: movi {{v[0-9]+}}.16b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmhsz16xi8:
+; CHECK: movi {{v[0-9]+.(16b|2d)}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp uge <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmhsz4xi16(<4 x i16> %A) {
-;CHECK: movi {{v[0-9]+}}.8b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-LABEL: cmhsz4xi16:
+; CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%tmp3 = icmp uge <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmhsz8xi16(<8 x i16> %A) {
-;CHECK: movi {{v[0-9]+}}.16b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK-LABEL: cmhsz8xi16:
+; CHECK: movi {{v[0-9]+.(16b|2d)}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp3 = icmp uge <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmhsz2xi32(<2 x i32> %A) {
-;CHECK: movi {{v[0-9]+}}.8b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: cmhsz2xi32:
+; CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = icmp uge <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmhsz4xi32(<4 x i32> %A) {
-;CHECK: movi {{v[0-9]+}}.16b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: cmhsz4xi32:
+; CHECK: movi {{v[0-9]+.(16b|2d)}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = icmp uge <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmhsz2xi64(<2 x i64> %A) {
-;CHECK: movi {{v[0-9]+}}.16b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: cmhsz2xi64:
+; CHECK: movi {{v[0-9]+.(16b|2d)}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = icmp uge <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -972,196 +1098,217 @@ define <2 x i64> @cmhsz2xi64(<2 x i64> %A) {
define <8 x i8> @cmhiz8xi8(<8 x i8> %A) {
-;CHECK: movi {{v[0-9]+}}.8b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-LABEL: cmhiz8xi8:
+; CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ugt <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmhiz16xi8(<16 x i8> %A) {
-;CHECK: movi {{v[0-9]+}}.16b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: cmhiz16xi8:
+; CHECK: movi {{v[0-9]+.(16b|2d)}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ugt <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmhiz4xi16(<4 x i16> %A) {
-;CHECK: movi {{v[0-9]+}}.8b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-LABEL: cmhiz4xi16:
+; CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
%tmp3 = icmp ugt <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmhiz8xi16(<8 x i16> %A) {
-;CHECK: movi {{v[0-9]+}}.16b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK-LABEL: cmhiz8xi16:
+; CHECK: movi {{v[0-9]+.(16b|2d)}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
%tmp3 = icmp ugt <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmhiz2xi32(<2 x i32> %A) {
-;CHECK: movi {{v[0-9]+}}.8b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: cmhiz2xi32:
+; CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = icmp ugt <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmhiz4xi32(<4 x i32> %A) {
-;CHECK: movi {{v[0-9]+}}.16b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: cmhiz4xi32:
+; CHECK: movi {{v[0-9]+.(16b|2d)}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = icmp ugt <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmhiz2xi64(<2 x i64> %A) {
-;CHECK: movi {{v[0-9]+}}.16b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: cmhiz2xi64:
+; CHECK: movi {{v[0-9]+.(16b|2d)}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = icmp ugt <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmlsz8xi8(<8 x i8> %A) {
+; CHECK-LABEL: cmlsz8xi8:
; Using registers other than v0, v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: movi v1.8b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
+; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
%tmp3 = icmp ule <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmlsz16xi8(<16 x i8> %A) {
+; CHECK-LABEL: cmlsz16xi8:
; Using registers other than v0, v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: movi v1.16b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b
+; CHECK: movi {{v1.16b|v1.2d}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b
%tmp3 = icmp ule <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmlsz4xi16(<4 x i16> %A) {
+; CHECK-LABEL: cmlsz4xi16:
; Using registers other than v0, v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: movi v1.8b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
+; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
%tmp3 = icmp ule <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmlsz8xi16(<8 x i16> %A) {
+; CHECK-LABEL: cmlsz8xi16:
; Using registers other than v0, v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: movi v1.16b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h
+; CHECK: movi {{v1.16b|v1.2d}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h
%tmp3 = icmp ule <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmlsz2xi32(<2 x i32> %A) {
+; CHECK-LABEL: cmlsz2xi32:
; Using registers other than v0, v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: movi v1.8b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
%tmp3 = icmp ule <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmlsz4xi32(<4 x i32> %A) {
+; CHECK-LABEL: cmlsz4xi32:
; Using registers other than v0, v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: movi v1.16b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK: movi {{v1.16b|v1.2d}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s
%tmp3 = icmp ule <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmlsz2xi64(<2 x i64> %A) {
+; CHECK-LABEL: cmlsz2xi64:
; Using registers other than v0, v1 is possible, but would be odd.
; LS implemented as HS, so check reversed operands.
-;CHECK: movi v1.16b, #0x0
-;CHECK-NEXT: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK: movi {{v1.16b|v1.2d}}, #{{0x0|0}}
+; CHECK-NEXT: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d
%tmp3 = icmp ule <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <8 x i8> @cmloz8xi8(<8 x i8> %A) {
+; CHECK-LABEL: cmloz8xi8:
; Using registers other than v0, v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: movi v1.8b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.8b, v1.8b, {{v[0-9]+}}.8b
+; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.8b, v1.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ult <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
}
define <16 x i8> @cmloz16xi8(<16 x i8> %A) {
+; CHECK-LABEL: cmloz16xi8:
; Using registers other than v0, v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: movi v1.16b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b
+; CHECK: movi {{v1.16b|v1.2d}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b
%tmp3 = icmp ult <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
}
define <4 x i16> @cmloz4xi16(<4 x i16> %A) {
+; CHECK-LABEL: cmloz4xi16:
; Using registers other than v0, v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: movi v1.8b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h
+; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h
%tmp3 = icmp ult <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
}
define <8 x i16> @cmloz8xi16(<8 x i16> %A) {
+; CHECK-LABEL: cmloz8xi16:
; Using registers other than v0, v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: movi v1.16b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h
+; CHECK: movi {{v1.16b|v1.2d}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h
%tmp3 = icmp ult <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
}
define <2 x i32> @cmloz2xi32(<2 x i32> %A) {
+; CHECK-LABEL: cmloz2xi32:
; Using registers other than v0, v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: movi v1.8b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s
%tmp3 = icmp ult <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @cmloz4xi32(<4 x i32> %A) {
+; CHECK-LABEL: cmloz4xi32:
; Using registers other than v0, v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: movi v1.16b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK: movi {{v1.16b|v1.2d}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s
%tmp3 = icmp ult <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @cmloz2xi64(<2 x i64> %A) {
+; CHECK-LABEL: cmloz2xi64:
; Using registers other than v0, v1 is possible, but would be odd.
; LO implemented as HI, so check reversed operands.
-;CHECK: movi v1.16b, #0x0
-;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK: movi {{v1.16b|v1.2d}}, #{{0x0|0}}
+; CHECK-NEXT: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d
%tmp3 = icmp ult <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1169,144 +1316,162 @@ define <2 x i64> @cmloz2xi64(<2 x i64> %A) {
define <2 x i32> @fcmoeq2xfloat(<2 x float> %A, <2 x float> %B) {
-;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: fcmoeq2xfloat:
+; CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = fcmp oeq <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmoeq4xfloat(<4 x float> %A, <4 x float> %B) {
-;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: fcmoeq4xfloat:
+; CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = fcmp oeq <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmoeq2xdouble(<2 x double> %A, <2 x double> %B) {
-;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: fcmoeq2xdouble:
+; CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = fcmp oeq <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmoge2xfloat(<2 x float> %A, <2 x float> %B) {
-;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: fcmoge2xfloat:
+; CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = fcmp oge <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmoge4xfloat(<4 x float> %A, <4 x float> %B) {
-;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: fcmoge4xfloat:
+; CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = fcmp oge <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmoge2xdouble(<2 x double> %A, <2 x double> %B) {
-;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: fcmoge2xdouble:
+; CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = fcmp oge <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmogt2xfloat(<2 x float> %A, <2 x float> %B) {
-;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK-LABEL: fcmogt2xfloat:
+; CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
%tmp3 = fcmp ogt <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmogt4xfloat(<4 x float> %A, <4 x float> %B) {
-;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-LABEL: fcmogt4xfloat:
+; CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
%tmp3 = fcmp ogt <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmogt2xdouble(<2 x double> %A, <2 x double> %B) {
-;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK-LABEL: fcmogt2xdouble:
+; CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
%tmp3 = fcmp ogt <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmole2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmole2xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; OLE implemented as OGE, so check reversed operands.
-;CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s
%tmp3 = fcmp ole <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmole4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmole4xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; OLE implemented as OGE, so check reversed operands.
-;CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s
%tmp3 = fcmp ole <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmole2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: fcmole2xdouble:
; Using registers other than v0, v1 is possible, but would be odd.
; OLE implemented as OGE, so check reversed operands.
-;CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d
%tmp3 = fcmp ole <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmolt2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmolt2xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; OLT implemented as OGT, so check reversed operands.
-;CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
%tmp3 = fcmp olt <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmolt4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmolt4xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; OLT implemented as OGT, so check reversed operands.
-;CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
%tmp3 = fcmp olt <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmolt2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: fcmolt2xdouble:
; Using registers other than v0, v1 is possible, but would be odd.
; OLT implemented as OGT, so check reversed operands.
-;CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
%tmp3 = fcmp olt <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmone2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmone2xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; ONE = OGT | OLT, OLT implemented as OGT so check reversed operands
-;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
-;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp one <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmone4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmone4xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; ONE = OGT | OLT, OLT implemented as OGT so check reversed operands
-;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp one <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmone2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: fcmone2xdouble:
; Using registers other than v0, v1 is possible, but would be odd.
; ONE = OGT | OLT, OLT implemented as OGT so check reversed operands
-;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; todo check reversed operands
%tmp3 = fcmp one <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
@@ -1315,11 +1480,12 @@ define <2 x i64> @fcmone2xdouble(<2 x double> %A, <2 x double> %B) {
define <2 x i32> @fcmord2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmord2xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; ORD = OGE | OLT, OLT implemented as OGT, so check reversed operands.
-;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
-;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ord <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
@@ -1327,22 +1493,24 @@ define <2 x i32> @fcmord2xfloat(<2 x float> %A, <2 x float> %B) {
define <4 x i32> @fcmord4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmord4xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; ORD = OGE | OLT, OLT implemented as OGT, so check reversed operands.
-;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ord <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmord2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: fcmord2xdouble:
; Using registers other than v0, v1 is possible, but would be odd.
; ORD = OGE | OLT, OLT implemented as OGT, so check reversed operands.
-;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ord <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1350,236 +1518,260 @@ define <2 x i64> @fcmord2xdouble(<2 x double> %A, <2 x double> %B) {
define <2 x i32> @fcmuno2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmuno2xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; UNO = !(OGE | OLT), OLT implemented as OGT, so check reversed operands.
-;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
-;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp uno <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmuno4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmuno4xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; UNO = !(OGE | OLT), OLT implemented as OGT, so check reversed operands.
-;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uno <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmuno2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: fcmuno2xdouble:
; Using registers other than v0, v1 is possible, but would be odd.
; UNO = !(OGE | OLT), OLT implemented as OGT, so check reversed operands.
-;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uno <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmueq2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmueq2xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; UEQ = !ONE = !(OGT | OLT), OLT implemented as OGT so check reversed operands
-;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
-;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ueq <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmueq4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmueq4xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; UEQ = !ONE = !(OGT | OLT), OLT implemented as OGT so check reversed operands
-;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ueq <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmueq2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: fcmueq2xdouble:
; Using registers other than v0, v1 is possible, but would be odd.
; UEQ = !ONE = !(OGT | OLT), OLT implemented as OGT so check reversed operands
-;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
-;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ueq <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmuge2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmuge2xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; UGE = ULE with swapped operands, ULE implemented as !OGT.
-;CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp uge <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmuge4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmuge4xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; UGE = ULE with swapped operands, ULE implemented as !OGT.
-;CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uge <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmuge2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: fcmuge2xdouble:
; Using registers other than v0, v1 is possible, but would be odd.
; UGE = ULE with swapped operands, ULE implemented as !OGT.
-;CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uge <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmugt2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmugt2xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; UGT = ULT with swapped operands, ULT implemented as !OGE.
-;CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ugt <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmugt4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmugt4xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; UGT = ULT with swapped operands, ULT implemented as !OGE.
-;CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ugt <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmugt2xdouble(<2 x double> %A, <2 x double> %B) {
-;CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: fcmugt2xdouble:
+; CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ugt <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmule2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmule2xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; ULE implemented as !OGT.
-;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ule <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmule4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmule4xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; ULE implemented as !OGT.
-;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ule <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmule2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: fcmule2xdouble:
; Using registers other than v0, v1 is possible, but would be odd.
; ULE implemented as !OGT.
-;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ule <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmult2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmult2xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; ULT implemented as !OGE.
-;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ult <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmult4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmult4xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; ULT implemented as !OGE.
-;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ult <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmult2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: fcmult2xdouble:
; Using registers other than v0, v1 is possible, but would be odd.
; ULT implemented as !OGE.
-;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ult <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmune2xfloat(<2 x float> %A, <2 x float> %B) {
+; CHECK-LABEL: fcmune2xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; UNE = !OEQ.
-;CHECK: fcmeq {{v[0-9]+}}.2s, v0.2s, v1.2s
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmeq {{v[0-9]+}}.2s, v0.2s, v1.2s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp une <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmune4xfloat(<4 x float> %A, <4 x float> %B) {
+; CHECK-LABEL: fcmune4xfloat:
; Using registers other than v0, v1 is possible, but would be odd.
; UNE = !OEQ.
-;CHECK: fcmeq {{v[0-9]+}}.4s, v0.4s, v1.4s
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmeq {{v[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp une <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmune2xdouble(<2 x double> %A, <2 x double> %B) {
+; CHECK-LABEL: fcmune2xdouble:
; Using registers other than v0, v1 is possible, but would be odd.
; UNE = !OEQ.
-;CHECK: fcmeq {{v[0-9]+}}.2d, v0.2d, v1.2d
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmeq {{v[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp une <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmoeqz2xfloat(<2 x float> %A) {
-;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+; CHECK-LABEL: fcmoeqz2xfloat:
+; CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
%tmp3 = fcmp oeq <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmoeqz4xfloat(<4 x float> %A) {
-;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+; CHECK-LABEL: fcmoeqz4xfloat:
+; CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
%tmp3 = fcmp oeq <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmoeqz2xdouble(<2 x double> %A) {
-;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+; CHECK-LABEL: fcmoeqz2xdouble:
+; CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
%tmp3 = fcmp oeq <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1587,250 +1779,280 @@ define <2 x i64> @fcmoeqz2xdouble(<2 x double> %A) {
define <2 x i32> @fcmogez2xfloat(<2 x float> %A) {
-;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+; CHECK-LABEL: fcmogez2xfloat:
+; CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
%tmp3 = fcmp oge <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmogez4xfloat(<4 x float> %A) {
-;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+; CHECK-LABEL: fcmogez4xfloat:
+; CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
%tmp3 = fcmp oge <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmogez2xdouble(<2 x double> %A) {
-;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+; CHECK-LABEL: fcmogez2xdouble:
+; CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
%tmp3 = fcmp oge <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmogtz2xfloat(<2 x float> %A) {
-;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+; CHECK-LABEL: fcmogtz2xfloat:
+; CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
%tmp3 = fcmp ogt <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmogtz4xfloat(<4 x float> %A) {
-;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+; CHECK-LABEL: fcmogtz4xfloat:
+; CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
%tmp3 = fcmp ogt <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmogtz2xdouble(<2 x double> %A) {
-;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+; CHECK-LABEL: fcmogtz2xdouble:
+; CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
%tmp3 = fcmp ogt <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmoltz2xfloat(<2 x float> %A) {
-;CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+; CHECK-LABEL: fcmoltz2xfloat:
+; CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
%tmp3 = fcmp olt <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmoltz4xfloat(<4 x float> %A) {
-;CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+; CHECK-LABEL: fcmoltz4xfloat:
+; CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
%tmp3 = fcmp olt <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmoltz2xdouble(<2 x double> %A) {
-;CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+; CHECK-LABEL: fcmoltz2xdouble:
+; CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
%tmp3 = fcmp olt <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmolez2xfloat(<2 x float> %A) {
-;CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+; CHECK-LABEL: fcmolez2xfloat:
+; CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
%tmp3 = fcmp ole <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmolez4xfloat(<4 x float> %A) {
-;CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+; CHECK-LABEL: fcmolez4xfloat:
+; CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
%tmp3 = fcmp ole <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmolez2xdouble(<2 x double> %A) {
-;CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+; CHECK-LABEL: fcmolez2xdouble:
+; CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
%tmp3 = fcmp ole <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmonez2xfloat(<2 x float> %A) {
+; CHECK-LABEL: fcmonez2xfloat:
; ONE with zero = OLT | OGT
-;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp one <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmonez4xfloat(<4 x float> %A) {
+; CHECK-LABEL: fcmonez4xfloat:
; ONE with zero = OLT | OGT
-;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp one <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmonez2xdouble(<2 x double> %A) {
+; CHECK-LABEL: fcmonez2xdouble:
; ONE with zero = OLT | OGT
-;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp one <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmordz2xfloat(<2 x float> %A) {
+; CHECK-LABEL: fcmordz2xfloat:
; ORD with zero = OLT | OGE
-;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ord <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmordz4xfloat(<4 x float> %A) {
+; CHECK-LABEL: fcmordz4xfloat:
; ORD with zero = OLT | OGE
-;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ord <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmordz2xdouble(<2 x double> %A) {
+; CHECK-LABEL: fcmordz2xdouble:
; ORD with zero = OLT | OGE
-;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ord <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmueqz2xfloat(<2 x float> %A) {
+; CHECK-LABEL: fcmueqz2xfloat:
; UEQ with zero = !ONE = !(OLT | OGT)
-;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ueq <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmueqz4xfloat(<4 x float> %A) {
+; CHECK-LABEL: fcmueqz4xfloat:
; UEQ with zero = !ONE = !(OLT | OGT)
-;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ueq <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmueqz2xdouble(<2 x double> %A) {
+; CHECK-LABEL: fcmueqz2xdouble:
; UEQ with zero = !ONE = !(OLT | OGT)
-;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ueq <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmugez2xfloat(<2 x float> %A) {
+; CHECK-LABEL: fcmugez2xfloat:
; UGE with zero = !OLT
-;CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp uge <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmugez4xfloat(<4 x float> %A) {
+; CHECK-LABEL: fcmugez4xfloat:
; UGE with zero = !OLT
-;CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uge <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmugez2xdouble(<2 x double> %A) {
+; CHECK-LABEL: fcmugez2xdouble:
; UGE with zero = !OLT
-;CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uge <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmugtz2xfloat(<2 x float> %A) {
+; CHECK-LABEL: fcmugtz2xfloat:
; UGT with zero = !OLE
-;CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ugt <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmugtz4xfloat(<4 x float> %A) {
+; CHECK-LABEL: fcmugtz4xfloat:
; UGT with zero = !OLE
-;CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ugt <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmugtz2xdouble(<2 x double> %A) {
+; CHECK-LABEL: fcmugtz2xdouble:
; UGT with zero = !OLE
-;CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ugt <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmultz2xfloat(<2 x float> %A) {
+; CHECK-LABEL: fcmultz2xfloat:
; ULT with zero = !OGE
-;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ult <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmultz4xfloat(<4 x float> %A) {
-;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: fcmultz4xfloat:
+; CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ult <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmultz2xdouble(<2 x double> %A) {
-;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: fcmultz2xdouble:
+; CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ult <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1838,53 +2060,59 @@ define <2 x i64> @fcmultz2xdouble(<2 x double> %A) {
define <2 x i32> @fcmulez2xfloat(<2 x float> %A) {
+; CHECK-LABEL: fcmulez2xfloat:
; ULE with zero = !OGT
-;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ule <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmulez4xfloat(<4 x float> %A) {
+; CHECK-LABEL: fcmulez4xfloat:
; ULE with zero = !OGT
-;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ule <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmulez2xdouble(<2 x double> %A) {
+; CHECK-LABEL: fcmulez2xdouble:
; ULE with zero = !OGT
-;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ule <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
}
define <2 x i32> @fcmunez2xfloat(<2 x float> %A) {
+; CHECK-LABEL: fcmunez2xfloat:
; UNE with zero = !OEQ with zero
-;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp une <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmunez4xfloat(<4 x float> %A) {
+; CHECK-LABEL: fcmunez4xfloat:
; UNE with zero = !OEQ with zero
-;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp une <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmunez2xdouble(<2 x double> %A) {
+; CHECK-LABEL: fcmunez2xdouble:
; UNE with zero = !OEQ with zero
-;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp une <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1892,33 +2120,36 @@ define <2 x i64> @fcmunez2xdouble(<2 x double> %A) {
define <2 x i32> @fcmunoz2xfloat(<2 x float> %A) {
+; CHECK-LABEL: fcmunoz2xfloat:
; UNO with zero = !ORD = !(OLT | OGE)
-;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp uno <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
}
define <4 x i32> @fcmunoz4xfloat(<4 x float> %A) {
+; CHECK-LABEL: fcmunoz4xfloat:
; UNO with zero = !ORD = !(OLT | OGE)
-;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uno <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmunoz2xdouble(<2 x double> %A) {
+; CHECK-LABEL: fcmunoz2xdouble:
; UNO with zero = !ORD = !(OLT | OGE)
-;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0.0|0}}
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NEXT: {{mvn|not}} {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uno <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
diff --git a/test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll b/test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll
deleted file mode 100644
index 4dffcd1..0000000
--- a/test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define <4 x i32> @copyTuple.QPair(i8* %a, i8* %b) {
-; CHECK-LABEL: copyTuple.QPair:
-; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x{{[0-9]+|sp}}]
-entry:
- %vld = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 2, i32 2, i32 2, i32 2>, i32 0, i32 4)
- %extract = extractvalue { <4 x i32>, <4 x i32> } %vld, 0
- %vld1 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %b, <4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 1, i32 4)
- %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld1, 0
- ret <4 x i32> %vld1.fca.0.extract
-}
-
-define <4 x i32> @copyTuple.QTriple(i8* %a, i8* %b, <4 x i32> %c) {
-; CHECK-LABEL: copyTuple.QTriple:
-; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x{{[0-9]+|sp}}]
-entry:
- %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, i32 0, i32 4)
- %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0
- %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %b, <4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, i32 1, i32 4)
- %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0
- ret <4 x i32> %vld1.fca.0.extract
-}
-
-define <4 x i32> @copyTuple.QQuad(i8* %a, i8* %b, <4 x i32> %c) {
-; CHECK-LABEL: copyTuple.QQuad:
-; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: orr v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x{{[0-9]+|sp}}]
-entry:
- %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, <4 x i32> %c, i32 0, i32 4)
- %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0
- %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %b, <4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, i32 1, i32 4)
- %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0
- ret <4 x i32> %vld1.fca.0.extract
-}
-
-declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
-declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
-declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-crypto.ll b/test/CodeGen/AArch64/neon-crypto.ll
deleted file mode 100644
index c0014fa..0000000
--- a/test/CodeGen/AArch64/neon-crypto.ll
+++ /dev/null
@@ -1,144 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -mattr=+crypto | FileCheck %s
-; RUN: not llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon 2>&1 | FileCheck --check-prefix=CHECK-NO-CRYPTO %s
-
-declare <4 x i32> @llvm.arm.neon.sha256su1(<4 x i32>, <4 x i32>, <4 x i32>) #1
-
-declare <4 x i32> @llvm.arm.neon.sha256h2(<4 x i32>, <4 x i32>, <4 x i32>) #1
-
-declare <4 x i32> @llvm.arm.neon.sha256h(<4 x i32>, <4 x i32>, <4 x i32>) #1
-
-declare <4 x i32> @llvm.arm.neon.sha1su0(<4 x i32>, <4 x i32>, <4 x i32>) #1
-
-declare <4 x i32> @llvm.arm.neon.sha1m(<4 x i32>, i32, <4 x i32>) #1
-
-declare <4 x i32> @llvm.arm.neon.sha1p(<4 x i32>, i32, <4 x i32>) #1
-
-declare <4 x i32> @llvm.arm.neon.sha1c(<4 x i32>, i32, <4 x i32>) #1
-
-declare <4 x i32> @llvm.arm.neon.sha256su0(<4 x i32>, <4 x i32>) #1
-
-declare <4 x i32> @llvm.arm.neon.sha1su1(<4 x i32>, <4 x i32>) #1
-
-declare i32 @llvm.arm.neon.sha1h(i32) #1
-
-declare <16 x i8> @llvm.arm.neon.aesimc(<16 x i8>) #1
-
-declare <16 x i8> @llvm.arm.neon.aesmc(<16 x i8>) #1
-
-declare <16 x i8> @llvm.arm.neon.aesd(<16 x i8>, <16 x i8>) #1
-
-declare <16 x i8> @llvm.arm.neon.aese(<16 x i8>, <16 x i8>) #1
-
-define <16 x i8> @test_vaeseq_u8(<16 x i8> %data, <16 x i8> %key) {
-; CHECK: test_vaeseq_u8:
-; CHECK: aese {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-; CHECK-NO-CRYPTO: Cannot select: intrinsic %llvm.arm.neon.aese
-entry:
- %aese.i = tail call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %data, <16 x i8> %key)
- ret <16 x i8> %aese.i
-}
-
-define <16 x i8> @test_vaesdq_u8(<16 x i8> %data, <16 x i8> %key) {
-; CHECK: test_vaesdq_u8:
-; CHECK: aesd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %aesd.i = tail call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %data, <16 x i8> %key)
- ret <16 x i8> %aesd.i
-}
-
-define <16 x i8> @test_vaesmcq_u8(<16 x i8> %data) {
-; CHECK: test_vaesmcq_u8:
-; CHECK: aesmc {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %aesmc.i = tail call <16 x i8> @llvm.arm.neon.aesmc(<16 x i8> %data)
- ret <16 x i8> %aesmc.i
-}
-
-define <16 x i8> @test_vaesimcq_u8(<16 x i8> %data) {
-; CHECK: test_vaesimcq_u8:
-; CHECK: aesimc {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-entry:
- %aesimc.i = tail call <16 x i8> @llvm.arm.neon.aesimc(<16 x i8> %data)
- ret <16 x i8> %aesimc.i
-}
-
-define i32 @test_vsha1h_u32(i32 %hash_e) {
-; CHECK: test_vsha1h_u32:
-; CHECK: sha1h {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %sha1h1.i = tail call i32 @llvm.arm.neon.sha1h(i32 %hash_e)
- ret i32 %sha1h1.i
-}
-
-define <4 x i32> @test_vsha1su1q_u32(<4 x i32> %tw0_3, <4 x i32> %w12_15) {
-; CHECK: test_vsha1su1q_u32:
-; CHECK: sha1su1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-entry:
- %sha1su12.i = tail call <4 x i32> @llvm.arm.neon.sha1su1(<4 x i32> %tw0_3, <4 x i32> %w12_15)
- ret <4 x i32> %sha1su12.i
-}
-
-define <4 x i32> @test_vsha256su0q_u32(<4 x i32> %w0_3, <4 x i32> %w4_7) {
-; CHECK: test_vsha256su0q_u32:
-; CHECK: sha256su0 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-entry:
- %sha256su02.i = tail call <4 x i32> @llvm.arm.neon.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7)
- ret <4 x i32> %sha256su02.i
-}
-
-define <4 x i32> @test_vsha1cq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
-; CHECK: test_vsha1cq_u32:
-; CHECK: sha1c {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %sha1c1.i = tail call <4 x i32> @llvm.arm.neon.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
- ret <4 x i32> %sha1c1.i
-}
-
-define <4 x i32> @test_vsha1pq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
-; CHECK: test_vsha1pq_u32:
-; CHECK: sha1p {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %sha1p1.i = tail call <4 x i32> @llvm.arm.neon.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
- ret <4 x i32> %sha1p1.i
-}
-
-define <4 x i32> @test_vsha1mq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
-; CHECK: test_vsha1mq_u32:
-; CHECK: sha1m {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %sha1m1.i = tail call <4 x i32> @llvm.arm.neon.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
- ret <4 x i32> %sha1m1.i
-}
-
-define <4 x i32> @test_vsha1su0q_u32(<4 x i32> %w0_3, <4 x i32> %w4_7, <4 x i32> %w8_11) {
-; CHECK: test_vsha1su0q_u32:
-; CHECK: sha1su0 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-entry:
- %sha1su03.i = tail call <4 x i32> @llvm.arm.neon.sha1su0(<4 x i32> %w0_3, <4 x i32> %w4_7, <4 x i32> %w8_11)
- ret <4 x i32> %sha1su03.i
-}
-
-define <4 x i32> @test_vsha256hq_u32(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) {
-; CHECK: test_vsha256hq_u32:
-; CHECK: sha256h {{q[0-9]+}}, {{q[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %sha256h3.i = tail call <4 x i32> @llvm.arm.neon.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk)
- ret <4 x i32> %sha256h3.i
-}
-
-define <4 x i32> @test_vsha256h2q_u32(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) {
-; CHECK: test_vsha256h2q_u32:
-; CHECK: sha256h2 {{q[0-9]+}}, {{q[0-9]+}}, {{v[0-9]+}}.4s
-entry:
- %sha256h23.i = tail call <4 x i32> @llvm.arm.neon.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk)
- ret <4 x i32> %sha256h23.i
-}
-
-define <4 x i32> @test_vsha256su1q_u32(<4 x i32> %tw0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) {
-; CHECK: test_vsha256su1q_u32:
-; CHECK: sha256su1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-entry:
- %sha256su13.i = tail call <4 x i32> @llvm.arm.neon.sha256su1(<4 x i32> %tw0_3, <4 x i32> %w8_11, <4 x i32> %w12_15)
- ret <4 x i32> %sha256su13.i
-}
-
diff --git a/test/CodeGen/AArch64/neon-diagnostics.ll b/test/CodeGen/AArch64/neon-diagnostics.ll
index f546aa7..099b685 100644
--- a/test/CodeGen/AArch64/neon-diagnostics.ll
+++ b/test/CodeGen/AArch64/neon-diagnostics.ll
@@ -21,4 +21,4 @@ define <4 x i32> @test_vshrn_not_match(<2 x i32> %a, <2 x i64> %b) {
%shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
%4 = bitcast <2 x i64> %shuffle.i to <4 x i32>
ret <4 x i32> %4
-}
\ No newline at end of file
+}
diff --git a/test/CodeGen/AArch64/neon-extract.ll b/test/CodeGen/AArch64/neon-extract.ll
index cddc226..f270b54 100644
--- a/test/CodeGen/AArch64/neon-extract.ll
+++ b/test/CodeGen/AArch64/neon-extract.ll
@@ -1,221 +1,221 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vext_s8:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2
+; CHECK-LABEL: test_vext_s8:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x2|2}}
entry:
%vext = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
ret <8 x i8> %vext
}
define <4 x i16> @test_vext_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vext_s16:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x6
+; CHECK-LABEL: test_vext_s16:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x6|6}}
entry:
%vext = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
ret <4 x i16> %vext
}
define <2 x i32> @test_vext_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vext_s32:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4
+; CHECK-LABEL: test_vext_s32:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x4|4}}
entry:
%vext = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 2>
ret <2 x i32> %vext
}
define <1 x i64> @test_vext_s64(<1 x i64> %a, <1 x i64> %b) {
-; CHECK: test_vext_s64:
+; CHECK-LABEL: test_vext_s64:
entry:
%vext = shufflevector <1 x i64> %a, <1 x i64> %b, <1 x i32> <i32 0>
ret <1 x i64> %vext
}
define <16 x i8> @test_vextq_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vextq_s8:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x2
+; CHECK-LABEL: test_vextq_s8:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x2|2}}
entry:
%vext = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
ret <16 x i8> %vext
}
define <8 x i16> @test_vextq_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vextq_s16:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6
+; CHECK-LABEL: test_vextq_s16:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x6|6}}
entry:
%vext = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
ret <8 x i16> %vext
}
define <4 x i32> @test_vextq_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vextq_s32:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x4
+; CHECK-LABEL: test_vextq_s32:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x4|4}}
entry:
%vext = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
ret <4 x i32> %vext
}
define <2 x i64> @test_vextq_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vextq_s64:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x8
+; CHECK-LABEL: test_vextq_s64:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x8|8}}
entry:
%vext = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
ret <2 x i64> %vext
}
define <8 x i8> @test_vext_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vext_u8:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2
+; CHECK-LABEL: test_vext_u8:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x2|2}}
entry:
%vext = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
ret <8 x i8> %vext
}
define <4 x i16> @test_vext_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vext_u16:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x6
+; CHECK-LABEL: test_vext_u16:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x6|6}}
entry:
%vext = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
ret <4 x i16> %vext
}
define <2 x i32> @test_vext_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vext_u32:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4
+; CHECK-LABEL: test_vext_u32:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x4|4}}
entry:
%vext = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 2>
ret <2 x i32> %vext
}
define <1 x i64> @test_vext_u64(<1 x i64> %a, <1 x i64> %b) {
-; CHECK: test_vext_u64:
+; CHECK-LABEL: test_vext_u64:
entry:
%vext = shufflevector <1 x i64> %a, <1 x i64> %b, <1 x i32> <i32 0>
ret <1 x i64> %vext
}
define <16 x i8> @test_vextq_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vextq_u8:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x2
+; CHECK-LABEL: test_vextq_u8:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x2|2}}
entry:
%vext = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
ret <16 x i8> %vext
}
define <8 x i16> @test_vextq_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vextq_u16:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6
+; CHECK-LABEL: test_vextq_u16:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x6|6}}
entry:
%vext = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
ret <8 x i16> %vext
}
define <4 x i32> @test_vextq_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vextq_u32:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x4
+; CHECK-LABEL: test_vextq_u32:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x4|4}}
entry:
%vext = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
ret <4 x i32> %vext
}
define <2 x i64> @test_vextq_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vextq_u64:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x8
+; CHECK-LABEL: test_vextq_u64:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x8|8}}
entry:
%vext = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
ret <2 x i64> %vext
}
define <2 x float> @test_vext_f32(<2 x float> %a, <2 x float> %b) {
-; CHECK: test_vext_f32:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4
+; CHECK-LABEL: test_vext_f32:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x4|4}}
entry:
%vext = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 2>
ret <2 x float> %vext
}
define <1 x double> @test_vext_f64(<1 x double> %a, <1 x double> %b) {
-; CHECK: test_vext_f64:
+; CHECK-LABEL: test_vext_f64:
entry:
%vext = shufflevector <1 x double> %a, <1 x double> %b, <1 x i32> <i32 0>
ret <1 x double> %vext
}
define <4 x float> @test_vextq_f32(<4 x float> %a, <4 x float> %b) {
-; CHECK: test_vextq_f32:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x4
+; CHECK-LABEL: test_vextq_f32:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x4|4}}
entry:
%vext = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
ret <4 x float> %vext
}
define <2 x double> @test_vextq_f64(<2 x double> %a, <2 x double> %b) {
-; CHECK: test_vextq_f64:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x8
+; CHECK-LABEL: test_vextq_f64:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x8|8}}
entry:
%vext = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 2>
ret <2 x double> %vext
}
define <8 x i8> @test_vext_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vext_p8:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2
+; CHECK-LABEL: test_vext_p8:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x2|2}}
entry:
%vext = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
ret <8 x i8> %vext
}
define <4 x i16> @test_vext_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vext_p16:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x6
+; CHECK-LABEL: test_vext_p16:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x6|6}}
entry:
%vext = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
ret <4 x i16> %vext
}
define <16 x i8> @test_vextq_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vextq_p8:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x2
+; CHECK-LABEL: test_vextq_p8:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x2|2}}
entry:
%vext = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
ret <16 x i8> %vext
}
define <8 x i16> @test_vextq_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vextq_p16:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6
+; CHECK-LABEL: test_vextq_p16:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x6|6}}
entry:
%vext = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
ret <8 x i16> %vext
}
define <8 x i8> @test_undef_vext_s8(<8 x i8> %a) {
-; CHECK: test_undef_vext_s8:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2
+; CHECK-LABEL: test_undef_vext_s8:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x2|2}}
entry:
%vext = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 10, i32 10, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
ret <8 x i8> %vext
}
define <16 x i8> @test_undef_vextq_s8(<16 x i8> %a) {
-; CHECK: test_undef_vextq_s8:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6
+; CHECK-LABEL: test_undef_vextq_s8:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x6|6}}
entry:
%vext = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 20, i32 20, i32 20, i32 20, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 20, i32 20, i32 20, i32 20, i32 20>
ret <16 x i8> %vext
}
define <4 x i16> @test_undef_vext_s16(<4 x i16> %a) {
-; CHECK: test_undef_vext_s16:
-; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4
+; CHECK-LABEL: test_undef_vext_s16:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x4|4}}
entry:
%vext = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
ret <4 x i16> %vext
}
define <8 x i16> @test_undef_vextq_s16(<8 x i16> %a) {
-; CHECK: test_undef_vextq_s16:
-; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6
+; CHECK-LABEL: test_undef_vextq_s16:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x6|6}}
entry:
%vext = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 10, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
ret <8 x i16> %vext
diff --git a/test/CodeGen/AArch64/neon-facge-facgt.ll b/test/CodeGen/AArch64/neon-facge-facgt.ll
deleted file mode 100644
index 28e8212..0000000
--- a/test/CodeGen/AArch64/neon-facge-facgt.ll
+++ /dev/null
@@ -1,56 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float>, <2 x float>)
-declare <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float>, <4 x float>)
-declare <2 x i64> @llvm.arm.neon.vacge.v2i64.v2f64(<2 x double>, <2 x double>)
-
-define <2 x i32> @facge_from_intr_v2i32(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
-; Using registers other than v0, v1 and v2 are possible, but would be odd.
-; CHECK: facge_from_intr_v2i32:
- %val = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %A, <2 x float> %B)
-; CHECK: facge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- ret <2 x i32> %val
-}
-define <4 x i32> @facge_from_intr_v4i32( <4 x float> %A, <4 x float> %B) {
-; Using registers other than v0, v1 and v2 are possible, but would be odd.
-; CHECK: facge_from_intr_v4i32:
- %val = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %A, <4 x float> %B)
-; CHECK: facge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
- ret <4 x i32> %val
-}
-
-define <2 x i64> @facge_from_intr_v2i64(<2 x double> %A, <2 x double> %B) {
-; Using registers other than v0, v1 and v2 are possible, but would be odd.
-; CHECK: facge_from_intr_v2i64:
- %val = call <2 x i64> @llvm.arm.neon.vacge.v2i64.v2f64(<2 x double> %A, <2 x double> %B)
-; CHECK: facge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
- ret <2 x i64> %val
-}
-
-declare <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float>, <2 x float>)
-declare <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float>, <4 x float>)
-declare <2 x i64> @llvm.arm.neon.vacgt.v2i64.v2f64(<2 x double>, <2 x double>)
-
-define <2 x i32> @facgt_from_intr_v2i32(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
-; Using registers other than v0, v1 and v2 are possible, but would be odd.
-; CHECK: facgt_from_intr_v2i32:
- %val = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %A, <2 x float> %B)
-; CHECK: facgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- ret <2 x i32> %val
-}
-define <4 x i32> @facgt_from_intr_v4i32( <4 x float> %A, <4 x float> %B) {
-; Using registers other than v0, v1 and v2 are possible, but would be odd.
-; CHECK: facgt_from_intr_v4i32:
- %val = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %A, <4 x float> %B)
-; CHECK: facgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
- ret <4 x i32> %val
-}
-
-define <2 x i64> @facgt_from_intr_v2i64(<2 x double> %A, <2 x double> %B) {
-; Using registers other than v0, v1 and v2 are possible, but would be odd.
-; CHECK: facgt_from_intr_v2i64:
- %val = call <2 x i64> @llvm.arm.neon.vacgt.v2i64.v2f64(<2 x double> %A, <2 x double> %B)
-; CHECK: facgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
- ret <2 x i64> %val
-}
-
diff --git a/test/CodeGen/AArch64/neon-frsqrt-frecp.ll b/test/CodeGen/AArch64/neon-frsqrt-frecp.ll
deleted file mode 100644
index 46fe25d..0000000
--- a/test/CodeGen/AArch64/neon-frsqrt-frecp.ll
+++ /dev/null
@@ -1,54 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-; Set of tests for when the intrinsic is used.
-
-declare <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.arm.neon.vrsqrts.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @frsqrts_from_intr_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; Using registers other than v0, v1 are possible, but would be odd.
-; CHECK: frsqrts v0.2s, v0.2s, v1.2s
- %val = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %lhs, <2 x float> %rhs)
- ret <2 x float> %val
-}
-
-define <4 x float> @frsqrts_from_intr_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; Using registers other than v0, v1 are possible, but would be odd.
-; CHECK: frsqrts v0.4s, v0.4s, v1.4s
- %val = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %lhs, <4 x float> %rhs)
- ret <4 x float> %val
-}
-
-define <2 x double> @frsqrts_from_intr_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; Using registers other than v0, v1 are possible, but would be odd.
-; CHECK: frsqrts v0.2d, v0.2d, v1.2d
- %val = call <2 x double> @llvm.arm.neon.vrsqrts.v2f64(<2 x double> %lhs, <2 x double> %rhs)
- ret <2 x double> %val
-}
-
-declare <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.arm.neon.vrecps.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @frecps_from_intr_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; Using registers other than v0, v1 are possible, but would be odd.
-; CHECK: frecps v0.2s, v0.2s, v1.2s
- %val = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %lhs, <2 x float> %rhs)
- ret <2 x float> %val
-}
-
-define <4 x float> @frecps_from_intr_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; Using registers other than v0, v1 are possible, but would be odd.
-; CHECK: frecps v0.4s, v0.4s, v1.4s
- %val = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %lhs, <4 x float> %rhs)
- ret <4 x float> %val
-}
-
-define <2 x double> @frecps_from_intr_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; Using registers other than v0, v1 are possible, but would be odd.
-; CHECK: frecps v0.2d, v0.2d, v1.2d
- %val = call <2 x double> @llvm.arm.neon.vrecps.v2f64(<2 x double> %lhs, <2 x double> %rhs)
- ret <2 x double> %val
-}
-
diff --git a/test/CodeGen/AArch64/neon-halving-add-sub.ll b/test/CodeGen/AArch64/neon-halving-add-sub.ll
deleted file mode 100644
index a8f59db..0000000
--- a/test/CodeGen/AArch64/neon-halving-add-sub.ll
+++ /dev/null
@@ -1,207 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_uhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_uhadd_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: uhadd v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_shadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_shadd_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: shadd v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_uhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_uhadd_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: uhadd v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_shadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_shadd_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: shadd v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_uhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_uhadd_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: uhadd v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_shadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_shadd_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: shadd v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_uhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_uhadd_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: uhadd v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_shadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_shadd_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: shadd v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-declare <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_uhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_uhadd_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: uhadd v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_shadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_shadd_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: shadd v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_uhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_uhadd_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: uhadd v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_shadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_shadd_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: shadd v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-
-declare <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_uhsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_uhsub_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: uhsub v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_shsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_shsub_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: shsub v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_uhsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_uhsub_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: uhsub v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_shsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_shsub_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: shsub v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_uhsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_uhsub_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: uhsub v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_shsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_shsub_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: shsub v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-declare <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_uhsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_uhsub_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: uhsub v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_shsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_shsub_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: shsub v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-declare <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_uhsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_uhsub_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: uhsub v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_shsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_shsub_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: shsub v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_uhsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_uhsub_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: uhsub v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_shsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_shsub_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: shsub v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
diff --git a/test/CodeGen/AArch64/neon-idiv.ll b/test/CodeGen/AArch64/neon-idiv.ll
new file mode 100644
index 0000000..de402c4
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-idiv.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -mattr=+neon | FileCheck %s
+
+define <4 x i32> @test1(<4 x i32> %a) {
+ %rem = srem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+ ret <4 x i32> %rem
+; CHECK-LABEL: test1
+; FIXME: Can we lower this more efficiently?
+; CHECK: mul
+; CHECK: mul
+; CHECK: mul
+; CHECK: mul
+}
+
diff --git a/test/CodeGen/AArch64/neon-load-store-v1i32.ll b/test/CodeGen/AArch64/neon-load-store-v1i32.ll
deleted file mode 100644
index 92f704d..0000000
--- a/test/CodeGen/AArch64/neon-load-store-v1i32.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-; Test load/store of v1i8, v1i16, v1i32 types can be selected correctly
-define void @load.store.v1i8(<1 x i8>* %ptr, <1 x i8>* %ptr2) {
-; CHECK-LABEL: load.store.v1i8:
-; CHECK: ldr b{{[0-9]+}}, [x{{[0-9]+|sp}}]
-; CHECK: str b{{[0-9]+}}, [x{{[0-9]+|sp}}]
- %a = load <1 x i8>* %ptr
- store <1 x i8> %a, <1 x i8>* %ptr2
- ret void
-}
-
-define void @load.store.v1i16(<1 x i16>* %ptr, <1 x i16>* %ptr2) {
-; CHECK-LABEL: load.store.v1i16:
-; CHECK: ldr h{{[0-9]+}}, [x{{[0-9]+|sp}}]
-; CHECK: str h{{[0-9]+}}, [x{{[0-9]+|sp}}]
- %a = load <1 x i16>* %ptr
- store <1 x i16> %a, <1 x i16>* %ptr2
- ret void
-}
-
-define void @load.store.v1i32(<1 x i32>* %ptr, <1 x i32>* %ptr2) {
-; CHECK-LABEL: load.store.v1i32:
-; CHECK: ldr s{{[0-9]+}}, [x{{[0-9]+|sp}}]
-; CHECK: str s{{[0-9]+}}, [x{{[0-9]+|sp}}]
- %a = load <1 x i32>* %ptr
- store <1 x i32> %a, <1 x i32>* %ptr2
- ret void
-}
diff --git a/test/CodeGen/AArch64/neon-max-min-pairwise.ll b/test/CodeGen/AArch64/neon-max-min-pairwise.ll
deleted file mode 100644
index 3e18077..0000000
--- a/test/CodeGen/AArch64/neon-max-min-pairwise.ll
+++ /dev/null
@@ -1,346 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_smaxp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; Using registers other than v0, v1 are possible, but would be odd.
-; CHECK: test_smaxp_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: smaxp v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_umaxp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
- %tmp1 = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: umaxp v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vpmaxs.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vpmaxu.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_smaxp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_smaxp_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vpmaxs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: smaxp v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_umaxp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_umaxp_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vpmaxu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: umaxp v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_smaxp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_smaxp_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: smaxp v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_umaxp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_umaxp_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: umaxp v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-
-declare <8 x i16> @llvm.arm.neon.vpmaxs.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vpmaxu.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_smaxp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_smaxp_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vpmaxs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: smaxp v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_umaxp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_umaxp_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vpmaxu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: umaxp v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-
-declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_smaxp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_smaxp_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: smaxp v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_umaxp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_umaxp_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: umaxp v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vpmaxs.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vpmaxu.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_smaxp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_smaxp_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vpmaxs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: smaxp v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_umaxp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_umaxp_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vpmaxu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: umaxp v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-declare <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_sminp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; Using registers other than v0, v1 are possible, but would be odd.
-; CHECK: test_sminp_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: sminp v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_uminp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
- %tmp1 = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: uminp v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vpmins.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vpminu.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_sminp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_sminp_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vpmins.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: sminp v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_uminp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_uminp_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vpminu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: uminp v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_sminp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_sminp_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: sminp v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_uminp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_uminp_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: uminp v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-
-declare <8 x i16> @llvm.arm.neon.vpmins.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vpminu.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_sminp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_sminp_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vpmins.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: sminp v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_uminp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_uminp_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vpminu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: uminp v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-
-declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_sminp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_sminp_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: sminp v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_uminp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_uminp_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: uminp v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vpmins.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vpminu.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_sminp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_sminp_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vpmins.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: sminp v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_uminp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_uminp_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vpminu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: uminp v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.arm.neon.vpmaxs.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.arm.neon.vpmaxs.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @test_fmaxp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; CHECK: test_fmaxp_v2f32:
- %val = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %lhs, <2 x float> %rhs)
-; CHECK: fmaxp v0.2s, v0.2s, v1.2s
- ret <2 x float> %val
-}
-
-define <4 x float> @test_fmaxp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; CHECK: test_fmaxp_v4f32:
- %val = call <4 x float> @llvm.arm.neon.vpmaxs.v4f32(<4 x float> %lhs, <4 x float> %rhs)
-; CHECK: fmaxp v0.4s, v0.4s, v1.4s
- ret <4 x float> %val
-}
-
-define <2 x double> @test_fmaxp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; CHECK: test_fmaxp_v2f64:
- %val = call <2 x double> @llvm.arm.neon.vpmaxs.v2f64(<2 x double> %lhs, <2 x double> %rhs)
-; CHECK: fmaxp v0.2d, v0.2d, v1.2d
- ret <2 x double> %val
-}
-
-declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.arm.neon.vpmins.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.arm.neon.vpmins.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @test_fminp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; CHECK: test_fminp_v2f32:
- %val = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> %lhs, <2 x float> %rhs)
-; CHECK: fminp v0.2s, v0.2s, v1.2s
- ret <2 x float> %val
-}
-
-define <4 x float> @test_fminp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; CHECK: test_fminp_v4f32:
- %val = call <4 x float> @llvm.arm.neon.vpmins.v4f32(<4 x float> %lhs, <4 x float> %rhs)
-; CHECK: fminp v0.4s, v0.4s, v1.4s
- ret <4 x float> %val
-}
-
-define <2 x double> @test_fminp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; CHECK: test_fminp_v2f64:
- %val = call <2 x double> @llvm.arm.neon.vpmins.v2f64(<2 x double> %lhs, <2 x double> %rhs)
-; CHECK: fminp v0.2d, v0.2d, v1.2d
- ret <2 x double> %val
-}
-
-declare <2 x float> @llvm.aarch64.neon.vpmaxnm.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.aarch64.neon.vpmaxnm.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.aarch64.neon.vpmaxnm.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @test_fmaxnmp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; CHECK: test_fmaxnmp_v2f32:
- %val = call <2 x float> @llvm.aarch64.neon.vpmaxnm.v2f32(<2 x float> %lhs, <2 x float> %rhs)
-; CHECK: fmaxnmp v0.2s, v0.2s, v1.2s
- ret <2 x float> %val
-}
-
-define <4 x float> @test_fmaxnmp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; CHECK: test_fmaxnmp_v4f32:
- %val = call <4 x float> @llvm.aarch64.neon.vpmaxnm.v4f32(<4 x float> %lhs, <4 x float> %rhs)
-; CHECK: fmaxnmp v0.4s, v0.4s, v1.4s
- ret <4 x float> %val
-}
-
-define <2 x double> @test_fmaxnmp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; CHECK: test_fmaxnmp_v2f64:
- %val = call <2 x double> @llvm.aarch64.neon.vpmaxnm.v2f64(<2 x double> %lhs, <2 x double> %rhs)
-; CHECK: fmaxnmp v0.2d, v0.2d, v1.2d
- ret <2 x double> %val
-}
-
-declare <2 x float> @llvm.aarch64.neon.vpminnm.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.aarch64.neon.vpminnm.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.aarch64.neon.vpminnm.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @test_fminnmp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; CHECK: test_fminnmp_v2f32:
- %val = call <2 x float> @llvm.aarch64.neon.vpminnm.v2f32(<2 x float> %lhs, <2 x float> %rhs)
-; CHECK: fminnmp v0.2s, v0.2s, v1.2s
- ret <2 x float> %val
-}
-
-define <4 x float> @test_fminnmp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; CHECK: test_fminnmp_v4f32:
- %val = call <4 x float> @llvm.aarch64.neon.vpminnm.v4f32(<4 x float> %lhs, <4 x float> %rhs)
-; CHECK: fminnmp v0.4s, v0.4s, v1.4s
- ret <4 x float> %val
-}
-
-define <2 x double> @test_fminnmp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; CHECK: test_fminnmp_v2f64:
- %val = call <2 x double> @llvm.aarch64.neon.vpminnm.v2f64(<2 x double> %lhs, <2 x double> %rhs)
-; CHECK: fminnmp v0.2d, v0.2d, v1.2d
- ret <2 x double> %val
-}
-
-define i32 @test_vminv_s32(<2 x i32> %a) {
-; CHECK-LABEL: test_vminv_s32
-; CHECK: sminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %1 = tail call <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v2i32(<2 x i32> %a)
- %2 = extractelement <1 x i32> %1, i32 0
- ret i32 %2
-}
-
-define i32 @test_vminv_u32(<2 x i32> %a) {
-; CHECK-LABEL: test_vminv_u32
-; CHECK: uminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %1 = tail call <1 x i32> @llvm.aarch64.neon.uminv.v1i32.v2i32(<2 x i32> %a)
- %2 = extractelement <1 x i32> %1, i32 0
- ret i32 %2
-}
-
-define i32 @test_vmaxv_s32(<2 x i32> %a) {
-; CHECK-LABEL: test_vmaxv_s32
-; CHECK: smaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %1 = tail call <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v2i32(<2 x i32> %a)
- %2 = extractelement <1 x i32> %1, i32 0
- ret i32 %2
-}
-
-define i32 @test_vmaxv_u32(<2 x i32> %a) {
-; CHECK-LABEL: test_vmaxv_u32
-; CHECK: umaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %1 = tail call <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v2i32(<2 x i32> %a)
- %2 = extractelement <1 x i32> %1, i32 0
- ret i32 %2
-}
-
-declare <1 x i32> @llvm.aarch64.neon.uminv.v1i32.v2i32(<2 x i32>)
-declare <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v2i32(<2 x i32>)
-declare <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v2i32(<2 x i32>)
-declare <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v2i32(<2 x i32>)
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-max-min.ll b/test/CodeGen/AArch64/neon-max-min.ll
deleted file mode 100644
index 7889c77..0000000
--- a/test/CodeGen/AArch64/neon-max-min.ll
+++ /dev/null
@@ -1,310 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_smax_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; Using registers other than v0, v1 are possible, but would be odd.
-; CHECK: test_smax_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: smax v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_umax_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
- %tmp1 = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: umax v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_smax_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_smax_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: smax v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_umax_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_umax_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: umax v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_smax_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_smax_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: smax v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_umax_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_umax_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: umax v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-
-declare <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_smax_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_smax_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: smax v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_umax_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_umax_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: umax v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-
-declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_smax_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_smax_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: smax v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_umax_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_umax_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: umax v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_smax_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_smax_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: smax v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_umax_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_umax_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: umax v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-declare <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_smin_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; Using registers other than v0, v1 are possible, but would be odd.
-; CHECK: test_smin_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: smin v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_umin_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
- %tmp1 = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: umin v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_smin_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_smin_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: smin v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_umin_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_umin_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: umin v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_smin_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_smin_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: smin v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_umin_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_umin_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: umin v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-
-declare <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_smin_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_smin_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: smin v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_umin_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_umin_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: umin v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-
-declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_smin_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_smin_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: smin v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_umin_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_umin_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: umin v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_smin_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_smin_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: smin v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_umin_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_umin_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: umin v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-declare <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.arm.neon.vmaxs.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @test_fmax_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; CHECK: test_fmax_v2f32:
- %val = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %lhs, <2 x float> %rhs)
-; CHECK: fmax v0.2s, v0.2s, v1.2s
- ret <2 x float> %val
-}
-
-define <4 x float> @test_fmax_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; CHECK: test_fmax_v4f32:
- %val = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %lhs, <4 x float> %rhs)
-; CHECK: fmax v0.4s, v0.4s, v1.4s
- ret <4 x float> %val
-}
-
-define <2 x double> @test_fmax_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; CHECK: test_fmax_v2f64:
- %val = call <2 x double> @llvm.arm.neon.vmaxs.v2f64(<2 x double> %lhs, <2 x double> %rhs)
-; CHECK: fmax v0.2d, v0.2d, v1.2d
- ret <2 x double> %val
-}
-
-declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.arm.neon.vmins.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @test_fmin_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; CHECK: test_fmin_v2f32:
- %val = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %lhs, <2 x float> %rhs)
-; CHECK: fmin v0.2s, v0.2s, v1.2s
- ret <2 x float> %val
-}
-
-define <4 x float> @test_fmin_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; CHECK: test_fmin_v4f32:
- %val = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %lhs, <4 x float> %rhs)
-; CHECK: fmin v0.4s, v0.4s, v1.4s
- ret <4 x float> %val
-}
-
-define <2 x double> @test_fmin_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; CHECK: test_fmin_v2f64:
- %val = call <2 x double> @llvm.arm.neon.vmins.v2f64(<2 x double> %lhs, <2 x double> %rhs)
-; CHECK: fmin v0.2d, v0.2d, v1.2d
- ret <2 x double> %val
-}
-
-
-declare <2 x float> @llvm.aarch64.neon.vmaxnm.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.aarch64.neon.vmaxnm.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.aarch64.neon.vmaxnm.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @test_fmaxnm_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; CHECK: test_fmaxnm_v2f32:
- %val = call <2 x float> @llvm.aarch64.neon.vmaxnm.v2f32(<2 x float> %lhs, <2 x float> %rhs)
-; CHECK: fmaxnm v0.2s, v0.2s, v1.2s
- ret <2 x float> %val
-}
-
-define <4 x float> @test_fmaxnm_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; CHECK: test_fmaxnm_v4f32:
- %val = call <4 x float> @llvm.aarch64.neon.vmaxnm.v4f32(<4 x float> %lhs, <4 x float> %rhs)
-; CHECK: fmaxnm v0.4s, v0.4s, v1.4s
- ret <4 x float> %val
-}
-
-define <2 x double> @test_fmaxnm_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; CHECK: test_fmaxnm_v2f64:
- %val = call <2 x double> @llvm.aarch64.neon.vmaxnm.v2f64(<2 x double> %lhs, <2 x double> %rhs)
-; CHECK: fmaxnm v0.2d, v0.2d, v1.2d
- ret <2 x double> %val
-}
-
-declare <2 x float> @llvm.aarch64.neon.vminnm.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.aarch64.neon.vminnm.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.aarch64.neon.vminnm.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @test_fminnm_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; CHECK: test_fminnm_v2f32:
- %val = call <2 x float> @llvm.aarch64.neon.vminnm.v2f32(<2 x float> %lhs, <2 x float> %rhs)
-; CHECK: fminnm v0.2s, v0.2s, v1.2s
- ret <2 x float> %val
-}
-
-define <4 x float> @test_fminnm_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; CHECK: test_fminnm_v4f32:
- %val = call <4 x float> @llvm.aarch64.neon.vminnm.v4f32(<4 x float> %lhs, <4 x float> %rhs)
-; CHECK: fminnm v0.4s, v0.4s, v1.4s
- ret <4 x float> %val
-}
-
-define <2 x double> @test_fminnm_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; CHECK: test_fminnm_v2f64:
- %val = call <2 x double> @llvm.aarch64.neon.vminnm.v2f64(<2 x double> %lhs, <2 x double> %rhs)
-; CHECK: fminnm v0.2d, v0.2d, v1.2d
- ret <2 x double> %val
-}
diff --git a/test/CodeGen/AArch64/neon-misc-scalar.ll b/test/CodeGen/AArch64/neon-misc-scalar.ll
deleted file mode 100644
index cca8deb..0000000
--- a/test/CodeGen/AArch64/neon-misc-scalar.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-;RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-declare <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64>)
-
-declare <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64>)
-
-declare <1 x i64> @llvm.arm.neon.vabs.v1i64(<1 x i64>)
-
-declare <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64>, <1 x i64>)
-
-declare <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_vuqadd_s64(<1 x i64> %a, <1 x i64> %b) {
-entry:
- ; CHECK: test_vuqadd_s64
- %vuqadd2.i = tail call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> %a, <1 x i64> %b)
- ; CHECK: suqadd d{{[0-9]+}}, d{{[0-9]+}}
- ret <1 x i64> %vuqadd2.i
-}
-
-define <1 x i64> @test_vsqadd_u64(<1 x i64> %a, <1 x i64> %b) {
-entry:
- ; CHECK: test_vsqadd_u64
- %vsqadd2.i = tail call <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64> %a, <1 x i64> %b)
- ; CHECK: usqadd d{{[0-9]+}}, d{{[0-9]+}}
- ret <1 x i64> %vsqadd2.i
-}
-
-define <1 x i64> @test_vabs_s64(<1 x i64> %a) {
- ; CHECK: test_vabs_s64
-entry:
- %vabs1.i = tail call <1 x i64> @llvm.arm.neon.vabs.v1i64(<1 x i64> %a)
- ; CHECK: abs d{{[0-9]+}}, d{{[0-9]+}}
- ret <1 x i64> %vabs1.i
-}
-
-define <1 x i64> @test_vqabs_s64(<1 x i64> %a) {
- ; CHECK: test_vqabs_s64
-entry:
- %vqabs1.i = tail call <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64> %a)
- ; CHECK: sqabs d{{[0-9]+}}, d{{[0-9]+}}
- ret <1 x i64> %vqabs1.i
-}
-
-define <1 x i64> @test_vqneg_s64(<1 x i64> %a) {
- ; CHECK: test_vqneg_s64
-entry:
- %vqneg1.i = tail call <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64> %a)
- ; CHECK: sqneg d{{[0-9]+}}, d{{[0-9]+}}
- ret <1 x i64> %vqneg1.i
-}
-
-define <1 x i64> @test_vneg_s64(<1 x i64> %a) {
- ; CHECK: test_vneg_s64
-entry:
- %sub.i = sub <1 x i64> zeroinitializer, %a
- ; CHECK: neg d{{[0-9]+}}, d{{[0-9]+}}
- ret <1 x i64> %sub.i
-}
-
diff --git a/test/CodeGen/AArch64/neon-misc.ll b/test/CodeGen/AArch64/neon-misc.ll
deleted file mode 100644
index 7ec36c2..0000000
--- a/test/CodeGen/AArch64/neon-misc.ll
+++ /dev/null
@@ -1,2014 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
-
-
-define <8 x i8> @test_vrev16_s8(<8 x i8> %a) #0 {
-; CHECK: rev16 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
- ret <8 x i8> %shuffle.i
-}
-
-define <16 x i8> @test_vrev16q_s8(<16 x i8> %a) #0 {
-; CHECK: rev16 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
- ret <16 x i8> %shuffle.i
-}
-
-define <8 x i8> @test_vrev32_s8(<8 x i8> %a) #0 {
-; CHECK: rev32 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
- ret <8 x i8> %shuffle.i
-}
-
-define <4 x i16> @test_vrev32_s16(<4 x i16> %a) #0 {
-; CHECK: rev32 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
- ret <4 x i16> %shuffle.i
-}
-
-define <16 x i8> @test_vrev32q_s8(<16 x i8> %a) #0 {
-; CHECK: rev32 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
- ret <16 x i8> %shuffle.i
-}
-
-define <8 x i16> @test_vrev32q_s16(<8 x i16> %a) #0 {
-; CHECK: rev32 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
- ret <8 x i16> %shuffle.i
-}
-
-define <8 x i8> @test_vrev64_s8(<8 x i8> %a) #0 {
-; CHECK: rev64 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
- ret <8 x i8> %shuffle.i
-}
-
-define <4 x i16> @test_vrev64_s16(<4 x i16> %a) #0 {
-; CHECK: rev64 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
- ret <4 x i16> %shuffle.i
-}
-
-define <2 x i32> @test_vrev64_s32(<2 x i32> %a) #0 {
-; CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
- ret <2 x i32> %shuffle.i
-}
-
-define <2 x float> @test_vrev64_f32(<2 x float> %a) #0 {
-; CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %shuffle.i = shufflevector <2 x float> %a, <2 x float> undef, <2 x i32> <i32 1, i32 0>
- ret <2 x float> %shuffle.i
-}
-
-define <16 x i8> @test_vrev64q_s8(<16 x i8> %a) #0 {
-; CHECK: rev64 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
- ret <16 x i8> %shuffle.i
-}
-
-define <8 x i16> @test_vrev64q_s16(<8 x i16> %a) #0 {
-; CHECK: rev64 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
- ret <8 x i16> %shuffle.i
-}
-
-define <4 x i32> @test_vrev64q_s32(<4 x i32> %a) #0 {
-; CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
- ret <4 x i32> %shuffle.i
-}
-
-define <4 x float> @test_vrev64q_f32(<4 x float> %a) #0 {
-; CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
- ret <4 x float> %shuffle.i
-}
-
-define <4 x i16> @test_vpaddl_s8(<8 x i8> %a) #0 {
-; CHECK: saddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
- %vpaddl.i = tail call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %a) #4
- ret <4 x i16> %vpaddl.i
-}
-
-define <2 x i32> @test_vpaddl_s16(<4 x i16> %a) #0 {
-; CHECK: saddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
- %vpaddl1.i = tail call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %a) #4
- ret <2 x i32> %vpaddl1.i
-}
-
-define <1 x i64> @test_vpaddl_s32(<2 x i32> %a) #0 {
-; CHECK: saddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
- %vpaddl1.i = tail call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %a) #4
- ret <1 x i64> %vpaddl1.i
-}
-
-define <4 x i16> @test_vpaddl_u8(<8 x i8> %a) #0 {
-; CHECK: uaddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
- %vpaddl.i = tail call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %a) #4
- ret <4 x i16> %vpaddl.i
-}
-
-define <2 x i32> @test_vpaddl_u16(<4 x i16> %a) #0 {
-; CHECK: uaddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
- %vpaddl1.i = tail call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a) #4
- ret <2 x i32> %vpaddl1.i
-}
-
-define <1 x i64> @test_vpaddl_u32(<2 x i32> %a) #0 {
-; CHECK: uaddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
- %vpaddl1.i = tail call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %a) #4
- ret <1 x i64> %vpaddl1.i
-}
-
-define <8 x i16> @test_vpaddlq_s8(<16 x i8> %a) #0 {
-; CHECK: saddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
- %vpaddl.i = tail call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %a) #4
- ret <8 x i16> %vpaddl.i
-}
-
-define <4 x i32> @test_vpaddlq_s16(<8 x i16> %a) #0 {
-; CHECK: saddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
- %vpaddl1.i = tail call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %a) #4
- ret <4 x i32> %vpaddl1.i
-}
-
-define <2 x i64> @test_vpaddlq_s32(<4 x i32> %a) #0 {
-; CHECK: saddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
- %vpaddl1.i = tail call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %a) #4
- ret <2 x i64> %vpaddl1.i
-}
-
-define <8 x i16> @test_vpaddlq_u8(<16 x i8> %a) #0 {
-; CHECK: uaddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
- %vpaddl.i = tail call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %a) #4
- ret <8 x i16> %vpaddl.i
-}
-
-define <4 x i32> @test_vpaddlq_u16(<8 x i16> %a) #0 {
-; CHECK: uaddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
- %vpaddl1.i = tail call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a) #4
- ret <4 x i32> %vpaddl1.i
-}
-
-define <2 x i64> @test_vpaddlq_u32(<4 x i32> %a) #0 {
-; CHECK: uaddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
- %vpaddl1.i = tail call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a) #4
- ret <2 x i64> %vpaddl1.i
-}
-
-define <4 x i16> @test_vpadal_s8(<4 x i16> %a, <8 x i8> %b) #0 {
-; CHECK: sadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
- %vpadal1.i = tail call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b) #4
- ret <4 x i16> %vpadal1.i
-}
-
-define <2 x i32> @test_vpadal_s16(<2 x i32> %a, <4 x i16> %b) #0 {
-; CHECK: sadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
- %vpadal2.i = tail call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b) #4
- ret <2 x i32> %vpadal2.i
-}
-
-define <1 x i64> @test_vpadal_s32(<1 x i64> %a, <2 x i32> %b) #0 {
-; CHECK: sadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
- %vpadal2.i = tail call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b) #4
- ret <1 x i64> %vpadal2.i
-}
-
-define <4 x i16> @test_vpadal_u8(<4 x i16> %a, <8 x i8> %b) #0 {
-; CHECK: uadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
- %vpadal1.i = tail call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b) #4
- ret <4 x i16> %vpadal1.i
-}
-
-define <2 x i32> @test_vpadal_u16(<2 x i32> %a, <4 x i16> %b) #0 {
-; CHECK: uadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
- %vpadal2.i = tail call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b) #4
- ret <2 x i32> %vpadal2.i
-}
-
-define <1 x i64> @test_vpadal_u32(<1 x i64> %a, <2 x i32> %b) #0 {
-; CHECK: uadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
- %vpadal2.i = tail call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b) #4
- ret <1 x i64> %vpadal2.i
-}
-
-define <8 x i16> @test_vpadalq_s8(<8 x i16> %a, <16 x i8> %b) #0 {
-; CHECK: sadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
- %vpadal1.i = tail call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b) #4
- ret <8 x i16> %vpadal1.i
-}
-
-define <4 x i32> @test_vpadalq_s16(<4 x i32> %a, <8 x i16> %b) #0 {
-; CHECK: sadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
- %vpadal2.i = tail call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b) #4
- ret <4 x i32> %vpadal2.i
-}
-
-define <2 x i64> @test_vpadalq_s32(<2 x i64> %a, <4 x i32> %b) #0 {
-; CHECK: sadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
- %vpadal2.i = tail call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b) #4
- ret <2 x i64> %vpadal2.i
-}
-
-define <8 x i16> @test_vpadalq_u8(<8 x i16> %a, <16 x i8> %b) #0 {
-; CHECK: uadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
- %vpadal1.i = tail call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b) #4
- ret <8 x i16> %vpadal1.i
-}
-
-define <4 x i32> @test_vpadalq_u16(<4 x i32> %a, <8 x i16> %b) #0 {
-; CHECK: uadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
- %vpadal2.i = tail call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b) #4
- ret <4 x i32> %vpadal2.i
-}
-
-define <2 x i64> @test_vpadalq_u32(<2 x i64> %a, <4 x i32> %b) #0 {
-; CHECK: uadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
- %vpadal2.i = tail call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b) #4
- ret <2 x i64> %vpadal2.i
-}
-
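-; SQABS: per-lane signed saturating absolute value.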
-define <8 x i8> @test_vqabs_s8(<8 x i8> %a) #0 {
-; CHECK: sqabs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %vqabs.i = tail call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %a) #4
- ret <8 x i8> %vqabs.i
-}
-
-define <16 x i8> @test_vqabsq_s8(<16 x i8> %a) #0 {
-; CHECK: sqabs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %vqabs.i = tail call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %a) #4
- ret <16 x i8> %vqabs.i
-}
-
-define <4 x i16> @test_vqabs_s16(<4 x i16> %a) #0 {
-; CHECK: sqabs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %vqabs1.i = tail call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %a) #4
- ret <4 x i16> %vqabs1.i
-}
-
-define <8 x i16> @test_vqabsq_s16(<8 x i16> %a) #0 {
-; CHECK: sqabs v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %vqabs1.i = tail call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %a) #4
- ret <8 x i16> %vqabs1.i
-}
-
-define <2 x i32> @test_vqabs_s32(<2 x i32> %a) #0 {
-; CHECK: sqabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vqabs1.i = tail call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %a) #4
- ret <2 x i32> %vqabs1.i
-}
-
-define <4 x i32> @test_vqabsq_s32(<4 x i32> %a) #0 {
-; CHECK: sqabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vqabs1.i = tail call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %a) #4
- ret <4 x i32> %vqabs1.i
-}
-
-define <2 x i64> @test_vqabsq_s64(<2 x i64> %a) #0 {
-; CHECK: sqabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vqabs1.i = tail call <2 x i64> @llvm.arm.neon.vqabs.v2i64(<2 x i64> %a) #4
- ret <2 x i64> %vqabs1.i
-}
-
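-; SQNEG: per-lane signed saturating negation.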
-define <8 x i8> @test_vqneg_s8(<8 x i8> %a) #0 {
-; CHECK: sqneg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %vqneg.i = tail call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %a) #4
- ret <8 x i8> %vqneg.i
-}
-
-define <16 x i8> @test_vqnegq_s8(<16 x i8> %a) #0 {
-; CHECK: sqneg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %vqneg.i = tail call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %a) #4
- ret <16 x i8> %vqneg.i
-}
-
-define <4 x i16> @test_vqneg_s16(<4 x i16> %a) #0 {
-; CHECK: sqneg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %vqneg1.i = tail call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %a) #4
- ret <4 x i16> %vqneg1.i
-}
-
-define <8 x i16> @test_vqnegq_s16(<8 x i16> %a) #0 {
-; CHECK: sqneg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %vqneg1.i = tail call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %a) #4
- ret <8 x i16> %vqneg1.i
-}
-
-define <2 x i32> @test_vqneg_s32(<2 x i32> %a) #0 {
-; CHECK: sqneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vqneg1.i = tail call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %a) #4
- ret <2 x i32> %vqneg1.i
-}
-
-define <4 x i32> @test_vqnegq_s32(<4 x i32> %a) #0 {
-; CHECK: sqneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vqneg1.i = tail call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %a) #4
- ret <4 x i32> %vqneg1.i
-}
-
-define <2 x i64> @test_vqnegq_s64(<2 x i64> %a) #0 {
-; CHECK: sqneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vqneg1.i = tail call <2 x i64> @llvm.arm.neon.vqneg.v2i64(<2 x i64> %a) #4
- ret <2 x i64> %vqneg1.i
-}
-
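-; NEG is matched from an integer subtraction from zero; FNEG from an fsub from -0.0, not from an intrinsic.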
-define <8 x i8> @test_vneg_s8(<8 x i8> %a) #0 {
-; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %sub.i = sub <8 x i8> zeroinitializer, %a
- ret <8 x i8> %sub.i
-}
-
-define <16 x i8> @test_vnegq_s8(<16 x i8> %a) #0 {
-; CHECK: neg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %sub.i = sub <16 x i8> zeroinitializer, %a
- ret <16 x i8> %sub.i
-}
-
-define <4 x i16> @test_vneg_s16(<4 x i16> %a) #0 {
-; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %sub.i = sub <4 x i16> zeroinitializer, %a
- ret <4 x i16> %sub.i
-}
-
-define <8 x i16> @test_vnegq_s16(<8 x i16> %a) #0 {
-; CHECK: neg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %sub.i = sub <8 x i16> zeroinitializer, %a
- ret <8 x i16> %sub.i
-}
-
-define <2 x i32> @test_vneg_s32(<2 x i32> %a) #0 {
-; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %sub.i = sub <2 x i32> zeroinitializer, %a
- ret <2 x i32> %sub.i
-}
-
-define <4 x i32> @test_vnegq_s32(<4 x i32> %a) #0 {
-; CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %sub.i = sub <4 x i32> zeroinitializer, %a
- ret <4 x i32> %sub.i
-}
-
-define <2 x i64> @test_vnegq_s64(<2 x i64> %a) #0 {
-; CHECK: neg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %sub.i = sub <2 x i64> zeroinitializer, %a
- ret <2 x i64> %sub.i
-}
-
-define <2 x float> @test_vneg_f32(<2 x float> %a) #0 {
-; CHECK: fneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %sub.i = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a
- ret <2 x float> %sub.i
-}
-
-define <4 x float> @test_vnegq_f32(<4 x float> %a) #0 {
-; CHECK: fneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
- ret <4 x float> %sub.i
-}
-
-define <2 x double> @test_vnegq_f64(<2 x double> %a) #0 {
-; CHECK: fneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
- ret <2 x double> %sub.i
-}
-
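-; ABS uses the NEON intrinsic; FABS is matched from llvm.fabs.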
-define <8 x i8> @test_vabs_s8(<8 x i8> %a) #0 {
-; CHECK: abs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %vabs.i = tail call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %a) #4
- ret <8 x i8> %vabs.i
-}
-
-define <16 x i8> @test_vabsq_s8(<16 x i8> %a) #0 {
-; CHECK: abs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %vabs.i = tail call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %a) #4
- ret <16 x i8> %vabs.i
-}
-
-define <4 x i16> @test_vabs_s16(<4 x i16> %a) #0 {
-; CHECK: abs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %vabs1.i = tail call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> %a) #4
- ret <4 x i16> %vabs1.i
-}
-
-define <8 x i16> @test_vabsq_s16(<8 x i16> %a) #0 {
-; CHECK: abs v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %vabs1.i = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %a) #4
- ret <8 x i16> %vabs1.i
-}
-
-define <2 x i32> @test_vabs_s32(<2 x i32> %a) #0 {
-; CHECK: abs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vabs1.i = tail call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> %a) #4
- ret <2 x i32> %vabs1.i
-}
-
-define <4 x i32> @test_vabsq_s32(<4 x i32> %a) #0 {
-; CHECK: abs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vabs1.i = tail call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> %a) #4
- ret <4 x i32> %vabs1.i
-}
-
-define <2 x i64> @test_vabsq_s64(<2 x i64> %a) #0 {
-; CHECK: abs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vabs1.i = tail call <2 x i64> @llvm.arm.neon.vabs.v2i64(<2 x i64> %a) #4
- ret <2 x i64> %vabs1.i
-}
-
-define <2 x float> @test_vabs_f32(<2 x float> %a) #1 {
-; CHECK: fabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vabs1.i = tail call <2 x float> @llvm.fabs.v2f32(<2 x float> %a) #4
- ret <2 x float> %vabs1.i
-}
-
-define <4 x float> @test_vabsq_f32(<4 x float> %a) #1 {
-; CHECK: fabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vabs1.i = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) #4
- ret <4 x float> %vabs1.i
-}
-
-define <2 x double> @test_vabsq_f64(<2 x double> %a) #1 {
-; CHECK: fabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vabs1.i = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %a) #4
- ret <2 x double> %vabs1.i
-}
-
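-; SUQADD: signed saturating accumulate of an unsigned value into the first operand.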
-define <8 x i8> @test_vuqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
-; CHECK: suqadd v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %vuqadd.i = tail call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
- ret <8 x i8> %vuqadd.i
-}
-
-define <16 x i8> @test_vuqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
-; CHECK: suqadd v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %vuqadd.i = tail call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
- ret <16 x i8> %vuqadd.i
-}
-
-define <4 x i16> @test_vuqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
-; CHECK: suqadd v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %vuqadd2.i = tail call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> %a, <4 x i16> %b) #4
- ret <4 x i16> %vuqadd2.i
-}
-
-define <8 x i16> @test_vuqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
-; CHECK: suqadd v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %vuqadd2.i = tail call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> %a, <8 x i16> %b) #4
- ret <8 x i16> %vuqadd2.i
-}
-
-define <2 x i32> @test_vuqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
-; CHECK: suqadd v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vuqadd2.i = tail call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> %a, <2 x i32> %b) #4
- ret <2 x i32> %vuqadd2.i
-}
-
-define <4 x i32> @test_vuqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
-; CHECK: suqadd v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vuqadd2.i = tail call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> %a, <4 x i32> %b) #4
- ret <4 x i32> %vuqadd2.i
-}
-
-define <2 x i64> @test_vuqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
-; CHECK: suqadd v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vuqadd2.i = tail call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> %a, <2 x i64> %b) #4
- ret <2 x i64> %vuqadd2.i
-}
-
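-; CLS: count leading sign bits in each lane.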
-define <8 x i8> @test_vcls_s8(<8 x i8> %a) #0 {
-; CHECK: cls v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %vcls.i = tail call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a) #4
- ret <8 x i8> %vcls.i
-}
-
-define <16 x i8> @test_vclsq_s8(<16 x i8> %a) #0 {
-; CHECK: cls v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %vcls.i = tail call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a) #4
- ret <16 x i8> %vcls.i
-}
-
-define <4 x i16> @test_vcls_s16(<4 x i16> %a) #0 {
-; CHECK: cls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %vcls1.i = tail call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a) #4
- ret <4 x i16> %vcls1.i
-}
-
-define <8 x i16> @test_vclsq_s16(<8 x i16> %a) #0 {
-; CHECK: cls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %vcls1.i = tail call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a) #4
- ret <8 x i16> %vcls1.i
-}
-
-define <2 x i32> @test_vcls_s32(<2 x i32> %a) #0 {
-; CHECK: cls v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcls1.i = tail call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a) #4
- ret <2 x i32> %vcls1.i
-}
-
-define <4 x i32> @test_vclsq_s32(<4 x i32> %a) #0 {
-; CHECK: cls v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcls1.i = tail call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a) #4
- ret <4 x i32> %vcls1.i
-}
-
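-; CLZ is matched from llvm.ctlz with the zero-undef flag set to false.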
-define <8 x i8> @test_vclz_s8(<8 x i8> %a) #0 {
-; CHECK: clz v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %vclz.i = tail call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4
- ret <8 x i8> %vclz.i
-}
-
-define <16 x i8> @test_vclzq_s8(<16 x i8> %a) #0 {
-; CHECK: clz v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %vclz.i = tail call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4
- ret <16 x i8> %vclz.i
-}
-
-define <4 x i16> @test_vclz_s16(<4 x i16> %a) #0 {
-; CHECK: clz v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %vclz1.i = tail call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) #4
- ret <4 x i16> %vclz1.i
-}
-
-define <8 x i16> @test_vclzq_s16(<8 x i16> %a) #0 {
-; CHECK: clz v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %vclz1.i = tail call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) #4
- ret <8 x i16> %vclz1.i
-}
-
-define <2 x i32> @test_vclz_s32(<2 x i32> %a) #0 {
-; CHECK: clz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vclz1.i = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) #4
- ret <2 x i32> %vclz1.i
-}
-
-define <4 x i32> @test_vclzq_s32(<4 x i32> %a) #0 {
-; CHECK: clz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vclz1.i = tail call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) #4
- ret <4 x i32> %vclz1.i
-}
-
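-; CNT (per-byte population count) is matched from llvm.ctpop.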
-define <8 x i8> @test_vcnt_s8(<8 x i8> %a) #0 {
-; CHECK: cnt v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %vctpop.i = tail call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
- ret <8 x i8> %vctpop.i
-}
-
-define <16 x i8> @test_vcntq_s8(<16 x i8> %a) #0 {
-; CHECK: cnt v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %vctpop.i = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
- ret <16 x i8> %vctpop.i
-}
-
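-; MVN is matched from an xor with all-ones; NOT only has byte arrangements, hence the .8b/.16b checks below.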
-define <8 x i8> @test_vmvn_s8(<8 x i8> %a) #0 {
-; CHECK: not v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %neg.i = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
- ret <8 x i8> %neg.i
-}
-
-define <16 x i8> @test_vmvnq_s8(<16 x i8> %a) #0 {
-; CHECK: not v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %neg.i = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
- ret <16 x i8> %neg.i
-}
-
-define <4 x i16> @test_vmvn_s16(<4 x i16> %a) #0 {
-; CHECK: not v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %neg.i = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
- ret <4 x i16> %neg.i
-}
-
-define <8 x i16> @test_vmvnq_s16(<8 x i16> %a) #0 {
-; CHECK: not v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %neg.i = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
- ret <8 x i16> %neg.i
-}
-
-define <2 x i32> @test_vmvn_s32(<2 x i32> %a) #0 {
-; CHECK: not v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %neg.i = xor <2 x i32> %a, <i32 -1, i32 -1>
- ret <2 x i32> %neg.i
-}
-
-define <4 x i32> @test_vmvnq_s32(<4 x i32> %a) #0 {
-; CHECK: not v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %neg.i = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
- ret <4 x i32> %neg.i
-}
-
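-; RBIT: reverse the bit order within each byte.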
-define <8 x i8> @test_vrbit_s8(<8 x i8> %a) #0 {
-; CHECK: rbit v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %vrbit.i = tail call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #4
- ret <8 x i8> %vrbit.i
-}
-
-define <16 x i8> @test_vrbitq_s8(<16 x i8> %a) #0 {
-; CHECK: rbit v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %vrbit.i = tail call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #4
- ret <16 x i8> %vrbit.i
-}
-
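-; XTN narrows via a plain trunc; the _high variants use XTN2 and concatenate with a shufflevector.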
-define <8 x i8> @test_vmovn_s16(<8 x i16> %a) #0 {
-; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
- %vmovn.i = trunc <8 x i16> %a to <8 x i8>
- ret <8 x i8> %vmovn.i
-}
-
-define <4 x i16> @test_vmovn_s32(<4 x i32> %a) #0 {
-; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
- %vmovn.i = trunc <4 x i32> %a to <4 x i16>
- ret <4 x i16> %vmovn.i
-}
-
-define <2 x i32> @test_vmovn_s64(<2 x i64> %a) #0 {
-; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
- %vmovn.i = trunc <2 x i64> %a to <2 x i32>
- ret <2 x i32> %vmovn.i
-}
-
-define <16 x i8> @test_vmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
-; CHECK: xtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
- %vmovn.i.i = trunc <8 x i16> %b to <8 x i8>
- %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vmovn.i.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- ret <16 x i8> %shuffle.i
-}
-
-define <8 x i16> @test_vmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
-; CHECK: xtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
- %vmovn.i.i = trunc <4 x i32> %b to <4 x i16>
- %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vmovn.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- ret <8 x i16> %shuffle.i
-}
-
-define <4 x i32> @test_vmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
-; CHECK: xtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
- %vmovn.i.i = trunc <2 x i64> %b to <2 x i32>
- %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vmovn.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x i32> %shuffle.i
-}
-
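-; SQXTUN/SQXTUN2: saturating narrow from signed to unsigned.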
-define <8 x i8> @test_vqmovun_s16(<8 x i16> %a) #0 {
-; CHECK: sqxtun v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
- %vqdmull1.i = tail call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %a) #4
- ret <8 x i8> %vqdmull1.i
-}
-
-define <4 x i16> @test_vqmovun_s32(<4 x i32> %a) #0 {
-; CHECK: sqxtun v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
- %vqdmull1.i = tail call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %a) #4
- ret <4 x i16> %vqdmull1.i
-}
-
-define <2 x i32> @test_vqmovun_s64(<2 x i64> %a) #0 {
-; CHECK: sqxtun v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
- %vqdmull1.i = tail call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %a) #4
- ret <2 x i32> %vqdmull1.i
-}
-
-define <16 x i8> @test_vqmovun_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
-; CHECK: sqxtun2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
- %vqdmull1.i.i = tail call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %b) #4
- %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vqdmull1.i.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- ret <16 x i8> %shuffle.i
-}
-
-define <8 x i16> @test_vqmovun_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
-; CHECK: sqxtun2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
- %vqdmull1.i.i = tail call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %b) #4
- %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vqdmull1.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- ret <8 x i16> %shuffle.i
-}
-
-define <4 x i32> @test_vqmovun_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
-; CHECK: sqxtun2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
- %vqdmull1.i.i = tail call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %b) #4
- %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vqdmull1.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x i32> %shuffle.i
-}
-
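-; SQXTN/UQXTN (and the 2-variants): saturating narrow, keeping the source signedness.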
-define <8 x i8> @test_vqmovn_s16(<8 x i16> %a) #0 {
-; CHECK: sqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
- %vqmovn1.i = tail call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %a) #4
- ret <8 x i8> %vqmovn1.i
-}
-
-define <4 x i16> @test_vqmovn_s32(<4 x i32> %a) #0 {
-; CHECK: sqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
- %vqmovn1.i = tail call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %a) #4
- ret <4 x i16> %vqmovn1.i
-}
-
-define <2 x i32> @test_vqmovn_s64(<2 x i64> %a) #0 {
-; CHECK: sqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
- %vqmovn1.i = tail call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %a) #4
- ret <2 x i32> %vqmovn1.i
-}
-
-define <16 x i8> @test_vqmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
-; CHECK: sqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
- %vqmovn1.i.i = tail call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %b) #4
- %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vqmovn1.i.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- ret <16 x i8> %shuffle.i
-}
-
-define <8 x i16> @test_vqmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
-; CHECK: test_vqmovn_high_s32
-; CHECK: sqxtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
- %vqmovn1.i.i = tail call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %b) #4
- %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vqmovn1.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- ret <8 x i16> %shuffle.i
-}
-
-define <4 x i32> @test_vqmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
-; CHECK: test_vqmovn_high_s64
-; CHECK: sqxtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
- %vqmovn1.i.i = tail call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %b) #4
- %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vqmovn1.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x i32> %shuffle.i
-}
-
-define <8 x i8> @test_vqmovn_u16(<8 x i16> %a) #0 {
-; CHECK: uqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
- %vqmovn1.i = tail call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %a) #4
- ret <8 x i8> %vqmovn1.i
-}
-
-define <4 x i16> @test_vqmovn_u32(<4 x i32> %a) #0 {
-; CHECK: uqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
- %vqmovn1.i = tail call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %a) #4
- ret <4 x i16> %vqmovn1.i
-}
-
-define <2 x i32> @test_vqmovn_u64(<2 x i64> %a) #0 {
-; CHECK: uqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
- %vqmovn1.i = tail call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %a) #4
- ret <2 x i32> %vqmovn1.i
-}
-
-define <16 x i8> @test_vqmovn_high_u16(<8 x i8> %a, <8 x i16> %b) #0 {
-; CHECK: uqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
- %vqmovn1.i.i = tail call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %b) #4
- %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vqmovn1.i.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- ret <16 x i8> %shuffle.i
-}
-
-define <8 x i16> @test_vqmovn_high_u32(<4 x i16> %a, <4 x i32> %b) #0 {
-; CHECK: uqxtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
- %vqmovn1.i.i = tail call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %b) #4
- %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vqmovn1.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- ret <8 x i16> %shuffle.i
-}
-
-define <4 x i32> @test_vqmovn_high_u64(<2 x i32> %a, <2 x i64> %b) #0 {
-; CHECK: uqxtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
- %vqmovn1.i.i = tail call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %b) #4
- %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vqmovn1.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x i32> %shuffle.i
-}
-
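-; SHLL/SHLL2: shift left long by the element size, matched from sext/zext followed by shl.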
-define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 {
-; CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8
- %1 = sext <8 x i8> %a to <8 x i16>
- %vshll_n = shl <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
- ret <8 x i16> %vshll_n
-}
-
-define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 {
-; CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16
- %1 = sext <4 x i16> %a to <4 x i32>
- %vshll_n = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
- ret <4 x i32> %vshll_n
-}
-
-define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 {
-; CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32
- %1 = sext <2 x i32> %a to <2 x i64>
- %vshll_n = shl <2 x i64> %1, <i64 32, i64 32>
- ret <2 x i64> %vshll_n
-}
-
-define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 {
-; CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8
- %1 = zext <8 x i8> %a to <8 x i16>
- %vshll_n = shl <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
- ret <8 x i16> %vshll_n
-}
-
-define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 {
-; CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16
- %1 = zext <4 x i16> %a to <4 x i32>
- %vshll_n = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
- ret <4 x i32> %vshll_n
-}
-
-define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 {
-; CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32
- %1 = zext <2 x i32> %a to <2 x i64>
- %vshll_n = shl <2 x i64> %1, <i64 32, i64 32>
- ret <2 x i64> %vshll_n
-}
-
-define <8 x i16> @test_vshll_high_n_s8(<16 x i8> %a) #0 {
-; CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8
- %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %1 = sext <8 x i8> %shuffle.i to <8 x i16>
- %vshll_n = shl <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
- ret <8 x i16> %vshll_n
-}
-
-define <4 x i32> @test_vshll_high_n_s16(<8 x i16> %a) #0 {
-; CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16
- %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %1 = sext <4 x i16> %shuffle.i to <4 x i32>
- %vshll_n = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
- ret <4 x i32> %vshll_n
-}
-
-define <2 x i64> @test_vshll_high_n_s32(<4 x i32> %a) #0 {
-; CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32
- %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %1 = sext <2 x i32> %shuffle.i to <2 x i64>
- %vshll_n = shl <2 x i64> %1, <i64 32, i64 32>
- ret <2 x i64> %vshll_n
-}
-
-define <8 x i16> @test_vshll_high_n_u8(<16 x i8> %a) #0 {
-; CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8
- %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %1 = zext <8 x i8> %shuffle.i to <8 x i16>
- %vshll_n = shl <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
- ret <8 x i16> %vshll_n
-}
-
-define <4 x i32> @test_vshll_high_n_u16(<8 x i16> %a) #0 {
-; CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16
- %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %1 = zext <4 x i16> %shuffle.i to <4 x i32>
- %vshll_n = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
- ret <4 x i32> %vshll_n
-}
-
-define <2 x i64> @test_vshll_high_n_u32(<4 x i32> %a) #0 {
-; CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32
- %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
- %1 = zext <2 x i32> %shuffle.i to <2 x i64>
- %vshll_n = shl <2 x i64> %1, <i64 32, i64 32>
- ret <2 x i64> %vshll_n
-}
-
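-; FCVTN/FCVTL (and the 2-variants) cover the f32<->f16 and f64<->f32 narrowing and lengthening conversions.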
-define <4 x i16> @test_vcvt_f16_f32(<4 x float> %a) #0 {
-; CHECK: fcvtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
- %vcvt1.i = tail call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %a) #4
- ret <4 x i16> %vcvt1.i
-}
-
-define <8 x i16> @test_vcvt_high_f16_f32(<4 x i16> %a, <4 x float> %b) #0 {
-; CHECK: fcvtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
- %vcvt1.i.i = tail call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %b) #4
- %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vcvt1.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- ret <8 x i16> %shuffle.i
-}
-
-define <4 x float> @test_vcvt_f32_f16(<4 x i16> %a) #0 {
-; CHECK: fcvtl v{{[0-9]+}}.4s, v{{[0-9]+}}.4h
- %vcvt1.i = tail call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %a) #4
- ret <4 x float> %vcvt1.i
-}
-
-define <4 x float> @test_vcvt_high_f32_f16(<8 x i16> %a) #0 {
-; CHECK: fcvtl2 v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
- %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
- %vcvt1.i.i = tail call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %shuffle.i.i) #4
- ret <4 x float> %vcvt1.i.i
-}
-
-define <2 x float> @test_vcvt_f32_f64(<2 x double> %a) #0 {
-; CHECK: fcvtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
- %vcvt.i = fptrunc <2 x double> %a to <2 x float>
- ret <2 x float> %vcvt.i
-}
-
-define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 {
-; CHECK: fcvtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
- %vcvt.i.i = fptrunc <2 x double> %b to <2 x float>
- %shuffle.i = shufflevector <2 x float> %a, <2 x float> %vcvt.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x float> %shuffle.i
-}
-
-define <2 x float> @test_vcvtx_f32_f64(<2 x double> %a) #0 {
-; CHECK: fcvtxn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
- %vcvtx_f32_f641.i = call <2 x float> @llvm.aarch64.neon.vcvtxn.v2f32.v2f64(<2 x double> %a) #4
- ret <2 x float> %vcvtx_f32_f641.i
-}
-
-define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 {
-; CHECK: fcvtxn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
- %vcvtx_f32_f641.i.i = tail call <2 x float> @llvm.aarch64.neon.vcvtxn.v2f32.v2f64(<2 x double> %b) #4
- %shuffle.i = shufflevector <2 x float> %a, <2 x float> %vcvtx_f32_f641.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x float> %shuffle.i
-}
-
-define <2 x double> @test_vcvt_f64_f32(<2 x float> %a) #0 {
-; CHECK: fcvtl v{{[0-9]+}}.2d, v{{[0-9]+}}.2s
- %vcvt.i = fpext <2 x float> %a to <2 x double>
- ret <2 x double> %vcvt.i
-}
-
-define <2 x double> @test_vcvt_high_f64_f32(<4 x float> %a) #0 {
-; CHECK: fcvtl2 v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
- %shuffle.i.i = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 2, i32 3>
- %vcvt.i.i = fpext <2 x float> %shuffle.i.i to <2 x double>
- ret <2 x double> %vcvt.i.i
-}
-
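-; Rounding: FRINTN (ties-to-even), FRINTA (llvm.round), FRINTP (ceil), FRINTM (floor), FRINTX (rint), FRINTZ (trunc), FRINTI (nearbyint).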
-define <2 x float> @test_vrndn_f32(<2 x float> %a) #0 {
-; CHECK: frintn v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vrndn1.i = tail call <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float> %a) #4
- ret <2 x float> %vrndn1.i
-}
-
-define <4 x float> @test_vrndnq_f32(<4 x float> %a) #0 {
-; CHECK: frintn v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vrndn1.i = tail call <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float> %a) #4
- ret <4 x float> %vrndn1.i
-}
-
-define <2 x double> @test_vrndnq_f64(<2 x double> %a) #0 {
-; CHECK: frintn v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vrndn1.i = tail call <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double> %a) #4
- ret <2 x double> %vrndn1.i
-}
-
-define <2 x float> @test_vrnda_f32(<2 x float> %a) #0 {
-; CHECK: frinta v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vrnda1.i = tail call <2 x float> @llvm.round.v2f32(<2 x float> %a) #4
- ret <2 x float> %vrnda1.i
-}
-
-define <4 x float> @test_vrndaq_f32(<4 x float> %a) #0 {
-; CHECK: frinta v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vrnda1.i = tail call <4 x float> @llvm.round.v4f32(<4 x float> %a) #4
- ret <4 x float> %vrnda1.i
-}
-
-define <2 x double> @test_vrndaq_f64(<2 x double> %a) #0 {
-; CHECK: frinta v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vrnda1.i = tail call <2 x double> @llvm.round.v2f64(<2 x double> %a) #4
- ret <2 x double> %vrnda1.i
-}
-
-define <2 x float> @test_vrndp_f32(<2 x float> %a) #0 {
-; CHECK: frintp v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vrndp1.i = tail call <2 x float> @llvm.ceil.v2f32(<2 x float> %a) #4
- ret <2 x float> %vrndp1.i
-}
-
-define <4 x float> @test_vrndpq_f32(<4 x float> %a) #0 {
-; CHECK: frintp v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vrndp1.i = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> %a) #4
- ret <4 x float> %vrndp1.i
-}
-
-define <2 x double> @test_vrndpq_f64(<2 x double> %a) #0 {
-; CHECK: frintp v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vrndp1.i = tail call <2 x double> @llvm.ceil.v2f64(<2 x double> %a) #4
- ret <2 x double> %vrndp1.i
-}
-
-define <2 x float> @test_vrndm_f32(<2 x float> %a) #0 {
-; CHECK: frintm v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vrndm1.i = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %a) #4
- ret <2 x float> %vrndm1.i
-}
-
-define <4 x float> @test_vrndmq_f32(<4 x float> %a) #0 {
-; CHECK: frintm v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vrndm1.i = tail call <4 x float> @llvm.floor.v4f32(<4 x float> %a) #4
- ret <4 x float> %vrndm1.i
-}
-
-define <2 x double> @test_vrndmq_f64(<2 x double> %a) #0 {
-; CHECK: frintm v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vrndm1.i = tail call <2 x double> @llvm.floor.v2f64(<2 x double> %a) #4
- ret <2 x double> %vrndm1.i
-}
-
-define <2 x float> @test_vrndx_f32(<2 x float> %a) #0 {
-; CHECK: frintx v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vrndx1.i = tail call <2 x float> @llvm.rint.v2f32(<2 x float> %a) #4
- ret <2 x float> %vrndx1.i
-}
-
-define <4 x float> @test_vrndxq_f32(<4 x float> %a) #0 {
-; CHECK: frintx v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vrndx1.i = tail call <4 x float> @llvm.rint.v4f32(<4 x float> %a) #4
- ret <4 x float> %vrndx1.i
-}
-
-define <2 x double> @test_vrndxq_f64(<2 x double> %a) #0 {
-; CHECK: frintx v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vrndx1.i = tail call <2 x double> @llvm.rint.v2f64(<2 x double> %a) #4
- ret <2 x double> %vrndx1.i
-}
-
-define <2 x float> @test_vrnd_f32(<2 x float> %a) #0 {
-; CHECK: frintz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vrnd1.i = tail call <2 x float> @llvm.trunc.v2f32(<2 x float> %a) #4
- ret <2 x float> %vrnd1.i
-}
-
-define <4 x float> @test_vrndq_f32(<4 x float> %a) #0 {
-; CHECK: frintz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vrnd1.i = tail call <4 x float> @llvm.trunc.v4f32(<4 x float> %a) #4
- ret <4 x float> %vrnd1.i
-}
-
-define <2 x double> @test_vrndq_f64(<2 x double> %a) #0 {
-; CHECK: frintz v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vrnd1.i = tail call <2 x double> @llvm.trunc.v2f64(<2 x double> %a) #4
- ret <2 x double> %vrnd1.i
-}
-
-define <2 x float> @test_vrndi_f32(<2 x float> %a) #0 {
-; CHECK: frinti v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vrndi1.i = tail call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %a) #4
- ret <2 x float> %vrndi1.i
-}
-
-define <4 x float> @test_vrndiq_f32(<4 x float> %a) #0 {
-; CHECK: frinti v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vrndi1.i = tail call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a) #4
- ret <4 x float> %vrndi1.i
-}
-
-define <2 x double> @test_vrndiq_f64(<2 x double> %a) #0 {
-; CHECK: frinti v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vrndi1.i = tail call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a) #4
- ret <2 x double> %vrndi1.i
-}
-
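-; FCVTZS/FCVTZU (round toward zero) are matched from fptosi/fptoui; mismatched widths add FCVTL, XTN or scalar-register steps.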
-define <2 x i32> @test_vcvt_s32_f32(<2 x float> %a) #0 {
-; CHECK: fcvtzs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvt.i = fptosi <2 x float> %a to <2 x i32>
- ret <2 x i32> %vcvt.i
-}
-
-define <4 x i32> @test_vcvtq_s32_f32(<4 x float> %a) #0 {
-; CHECK: fcvtzs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvt.i = fptosi <4 x float> %a to <4 x i32>
- ret <4 x i32> %vcvt.i
-}
-
-define <2 x i64> @test_vcvtq_s64_f64(<2 x double> %a) #0 {
-; CHECK: fcvtzs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvt.i = fptosi <2 x double> %a to <2 x i64>
- ret <2 x i64> %vcvt.i
-}
-
-define <2 x i32> @test_vcvt_u32_f32(<2 x float> %a) #0 {
-; CHECK: fcvtzu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvt.i = fptoui <2 x float> %a to <2 x i32>
- ret <2 x i32> %vcvt.i
-}
-
-define <4 x i32> @test_vcvtq_u32_f32(<4 x float> %a) #0 {
-; CHECK: fcvtzu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvt.i = fptoui <4 x float> %a to <4 x i32>
- ret <4 x i32> %vcvt.i
-}
-
-define <2 x i64> @test_vcvtq_u64_f64(<2 x double> %a) #0 {
-; CHECK: fcvtzu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvt.i = fptoui <2 x double> %a to <2 x i64>
- ret <2 x i64> %vcvt.i
-}
-
-define <2 x i64> @test_vcvt_s64_f32(<2 x float> %a) #0 {
-; CHECK: fcvtl v{{[0-9]+}}.2d, v{{[0-9]+}}.2s
-; CHECK: fcvtzs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvt.i = fptosi <2 x float> %a to <2 x i64>
- ret <2 x i64> %vcvt.i
-}
-
-define <2 x i64> @test_vcvt_u64_f32(<2 x float> %a) #0 {
-; CHECK: fcvtl v{{[0-9]+}}.2d, v{{[0-9]+}}.2s
-; CHECK: fcvtzu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvt.i = fptoui <2 x float> %a to <2 x i64>
- ret <2 x i64> %vcvt.i
-}
-
-define <4 x i16> @test_vcvt_s16_f32(<4 x float> %a) #0 {
-; CHECK: fcvtzs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
-; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
- %vcvt.i = fptosi <4 x float> %a to <4 x i16>
- ret <4 x i16> %vcvt.i
-}
-
-define <4 x i16> @test_vcvt_u16_f32(<4 x float> %a) #0 {
-; CHECK: fcvtzu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
-; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
- %vcvt.i = fptoui <4 x float> %a to <4 x i16>
- ret <4 x i16> %vcvt.i
-}
-
-define <2 x i32> @test_vcvt_s32_f64(<2 x double> %a) #0 {
-; CHECK: fcvtzs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
-; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
- %vcvt.i = fptosi <2 x double> %a to <2 x i32>
- ret <2 x i32> %vcvt.i
-}
-
-define <2 x i32> @test_vcvt_u32_f64(<2 x double> %a) #0 {
-; CHECK: fcvtzu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
-; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
- %vcvt.i = fptoui <2 x double> %a to <2 x i32>
- ret <2 x i32> %vcvt.i
-}
-
-define <1 x i8> @test_vcvt_s8_f64(<1 x double> %a) #0 {
-; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: ins v{{[0-9]+}}.b[0], w{{[0-9]+}}
- %vcvt.i = fptosi <1 x double> %a to <1 x i8>
- ret <1 x i8> %vcvt.i
-}
-
-define <1 x i8> @test_vcvt_u8_f64(<1 x double> %a) #0 {
-; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: ins v{{[0-9]+}}.b[0], w{{[0-9]+}}
- %vcvt.i = fptoui <1 x double> %a to <1 x i8>
- ret <1 x i8> %vcvt.i
-}
-
-define <1 x i16> @test_vcvt_s16_f64(<1 x double> %a) #0 {
-; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: ins v{{[0-9]+}}.h[0], w{{[0-9]+}}
- %vcvt.i = fptosi <1 x double> %a to <1 x i16>
- ret <1 x i16> %vcvt.i
-}
-
-define <1 x i16> @test_vcvt_u16_f64(<1 x double> %a) #0 {
-; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: ins v{{[0-9]+}}.h[0], w{{[0-9]+}}
- %vcvt.i = fptoui <1 x double> %a to <1 x i16>
- ret <1 x i16> %vcvt.i
-}
-
-define <1 x i32> @test_vcvt_s32_f64_v1(<1 x double> %a) #0 {
-; CHECK: fcvtzs w{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: fmov s{{[0-9]+}}, w{{[0-9]+}}
- %vcvt.i = fptosi <1 x double> %a to <1 x i32>
- ret <1 x i32> %vcvt.i
-}
-
-define <1 x i32> @test_vcvt_u32_f64_v1(<1 x double> %a) #0 {
-; CHECK: fcvtzu w{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: fmov s{{[0-9]+}}, w{{[0-9]+}}
- %vcvt.i = fptoui <1 x double> %a to <1 x i32>
- ret <1 x i32> %vcvt.i
-}
-
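-; FCVT{N,P,M,A}{S,U}: float-to-integer conversions rounding to nearest (ties-to-even), toward +Inf, toward -Inf, and to nearest (ties away).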
-define <2 x i32> @test_vcvtn_s32_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vcvtn_s32_f32
-; CHECK: fcvtns v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtns_f321.i = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> %a)
- ret <2 x i32> %vcvtns_f321.i
-}
-
-define <4 x i32> @test_vcvtnq_s32_f32(<4 x float> %a) {
-; CHECK-LABEL: test_vcvtnq_s32_f32
-; CHECK: fcvtns v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtns_f321.i = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> %a)
- ret <4 x i32> %vcvtns_f321.i
-}
-
-define <2 x i64> @test_vcvtnq_s64_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vcvtnq_s64_f64
-; CHECK: fcvtns v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtns_f641.i = call <2 x i64> @llvm.arm.neon.vcvtns.v2i64.v2f64(<2 x double> %a)
- ret <2 x i64> %vcvtns_f641.i
-}
-
-define <2 x i32> @test_vcvtn_u32_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vcvtn_u32_f32
-; CHECK: fcvtnu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtnu_f321.i = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> %a)
- ret <2 x i32> %vcvtnu_f321.i
-}
-
-define <4 x i32> @test_vcvtnq_u32_f32(<4 x float> %a) {
-; CHECK-LABEL: test_vcvtnq_u32_f32
-; CHECK: fcvtnu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtnu_f321.i = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> %a)
- ret <4 x i32> %vcvtnu_f321.i
-}
-
-define <2 x i64> @test_vcvtnq_u64_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vcvtnq_u64_f64
-; CHECK: fcvtnu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtnu_f641.i = call <2 x i64> @llvm.arm.neon.vcvtnu.v2i64.v2f64(<2 x double> %a)
- ret <2 x i64> %vcvtnu_f641.i
-}
-
-define <2 x i32> @test_vcvtp_s32_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vcvtp_s32_f32
-; CHECK: fcvtps v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtps_f321.i = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> %a)
- ret <2 x i32> %vcvtps_f321.i
-}
-
-define <4 x i32> @test_vcvtpq_s32_f32(<4 x float> %a) {
-; CHECK-LABEL: test_vcvtpq_s32_f32
-; CHECK: fcvtps v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtps_f321.i = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> %a)
- ret <4 x i32> %vcvtps_f321.i
-}
-
-define <2 x i64> @test_vcvtpq_s64_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vcvtpq_s64_f64
-; CHECK: fcvtps v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtps_f641.i = call <2 x i64> @llvm.arm.neon.vcvtps.v2i64.v2f64(<2 x double> %a)
- ret <2 x i64> %vcvtps_f641.i
-}
-
-define <2 x i32> @test_vcvtp_u32_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vcvtp_u32_f32
-; CHECK: fcvtpu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtpu_f321.i = call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> %a)
- ret <2 x i32> %vcvtpu_f321.i
-}
-
-define <4 x i32> @test_vcvtpq_u32_f32(<4 x float> %a) {
-; CHECK-LABEL: test_vcvtpq_u32_f32
-; CHECK: fcvtpu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtpu_f321.i = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> %a)
- ret <4 x i32> %vcvtpu_f321.i
-}
-
-define <2 x i64> @test_vcvtpq_u64_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vcvtpq_u64_f64
-; CHECK: fcvtpu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtpu_f641.i = call <2 x i64> @llvm.arm.neon.vcvtpu.v2i64.v2f64(<2 x double> %a)
- ret <2 x i64> %vcvtpu_f641.i
-}
-
-define <2 x i32> @test_vcvtm_s32_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vcvtm_s32_f32
-; CHECK: fcvtms v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtms_f321.i = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> %a)
- ret <2 x i32> %vcvtms_f321.i
-}
-
-define <4 x i32> @test_vcvtmq_s32_f32(<4 x float> %a) {
-; CHECK-LABEL: test_vcvtmq_s32_f32
-; CHECK: fcvtms v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtms_f321.i = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> %a)
- ret <4 x i32> %vcvtms_f321.i
-}
-
-define <2 x i64> @test_vcvtmq_s64_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vcvtmq_s64_f64
-; CHECK: fcvtms v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtms_f641.i = call <2 x i64> @llvm.arm.neon.vcvtms.v2i64.v2f64(<2 x double> %a)
- ret <2 x i64> %vcvtms_f641.i
-}
-
-define <2 x i32> @test_vcvtm_u32_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vcvtm_u32_f32
-; CHECK: fcvtmu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtmu_f321.i = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> %a)
- ret <2 x i32> %vcvtmu_f321.i
-}
-
-define <4 x i32> @test_vcvtmq_u32_f32(<4 x float> %a) {
-; CHECK-LABEL: test_vcvtmq_u32_f32
-; CHECK: fcvtmu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtmu_f321.i = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> %a)
- ret <4 x i32> %vcvtmu_f321.i
-}
-
-define <2 x i64> @test_vcvtmq_u64_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vcvtmq_u64_f64
-; CHECK: fcvtmu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtmu_f641.i = call <2 x i64> @llvm.arm.neon.vcvtmu.v2i64.v2f64(<2 x double> %a)
- ret <2 x i64> %vcvtmu_f641.i
-}
-
-define <2 x i32> @test_vcvta_s32_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vcvta_s32_f32
-; CHECK: fcvtas v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtas_f321.i = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> %a)
- ret <2 x i32> %vcvtas_f321.i
-}
-
-define <4 x i32> @test_vcvtaq_s32_f32(<4 x float> %a) {
-; CHECK-LABEL: test_vcvtaq_s32_f32
-; CHECK: fcvtas v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtas_f321.i = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> %a)
- ret <4 x i32> %vcvtas_f321.i
-}
-
-define <2 x i64> @test_vcvtaq_s64_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vcvtaq_s64_f64
-; CHECK: fcvtas v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtas_f641.i = call <2 x i64> @llvm.arm.neon.vcvtas.v2i64.v2f64(<2 x double> %a)
- ret <2 x i64> %vcvtas_f641.i
-}
-
-define <2 x i32> @test_vcvta_u32_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vcvta_u32_f32
-; CHECK: fcvtau v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvtau_f321.i = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> %a)
- ret <2 x i32> %vcvtau_f321.i
-}
-
-define <4 x i32> @test_vcvtaq_u32_f32(<4 x float> %a) {
-; CHECK-LABEL: test_vcvtaq_u32_f32
-; CHECK: fcvtau v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvtau_f321.i = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> %a)
- ret <4 x i32> %vcvtau_f321.i
-}
-
-define <2 x i64> @test_vcvtaq_u64_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vcvtaq_u64_f64
-; CHECK: fcvtau v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvtau_f641.i = call <2 x i64> @llvm.arm.neon.vcvtau.v2i64.v2f64(<2 x double> %a)
- ret <2 x i64> %vcvtau_f641.i
-}
-
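-; FRSQRTE/FRECPE/URECPE: reciprocal square root and reciprocal estimates.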
-define <2 x float> @test_vrsqrte_f32(<2 x float> %a) #0 {
-; CHECK: frsqrte v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vrsqrte1.i = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %a) #4
- ret <2 x float> %vrsqrte1.i
-}
-
-define <4 x float> @test_vrsqrteq_f32(<4 x float> %a) #0 {
-; CHECK: frsqrte v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vrsqrte1.i = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %a) #4
- ret <4 x float> %vrsqrte1.i
-}
-
-define <2 x double> @test_vrsqrteq_f64(<2 x double> %a) #0 {
-; CHECK: frsqrte v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vrsqrte1.i = tail call <2 x double> @llvm.arm.neon.vrsqrte.v2f64(<2 x double> %a) #4
- ret <2 x double> %vrsqrte1.i
-}
-
-define <2 x float> @test_vrecpe_f32(<2 x float> %a) #0 {
-; CHECK: frecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vrecpe1.i = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %a) #4
- ret <2 x float> %vrecpe1.i
-}
-
-define <4 x float> @test_vrecpeq_f32(<4 x float> %a) #0 {
-; CHECK: frecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vrecpe1.i = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %a) #4
- ret <4 x float> %vrecpe1.i
-}
-
-define <2 x double> @test_vrecpeq_f64(<2 x double> %a) #0 {
-; CHECK: frecpe v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vrecpe1.i = tail call <2 x double> @llvm.arm.neon.vrecpe.v2f64(<2 x double> %a) #4
- ret <2 x double> %vrecpe1.i
-}
-
-define <2 x i32> @test_vrecpe_u32(<2 x i32> %a) #0 {
-; CHECK: urecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vrecpe1.i = tail call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %a) #4
- ret <2 x i32> %vrecpe1.i
-}
-
-define <4 x i32> @test_vrecpeq_u32(<4 x i32> %a) #0 {
-; CHECK: urecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vrecpe1.i = tail call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %a) #4
- ret <4 x i32> %vrecpe1.i
-}
-
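-; FSQRT is matched from llvm.sqrt.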
-define <2 x float> @test_vsqrt_f32(<2 x float> %a) #0 {
-; CHECK: fsqrt v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vsqrt1.i = tail call <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) #4
- ret <2 x float> %vsqrt1.i
-}
-
-define <4 x float> @test_vsqrtq_f32(<4 x float> %a) #0 {
-; CHECK: fsqrt v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vsqrt1.i = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) #4
- ret <4 x float> %vsqrt1.i
-}
-
-define <2 x double> @test_vsqrtq_f64(<2 x double> %a) #0 {
-; CHECK: fsqrt v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vsqrt1.i = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) #4
- ret <2 x double> %vsqrt1.i
-}
-
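-; SCVTF/UCVTF are matched from sitofp/uitofp; mismatched widths add SSHLL/USHLL, FCVTN or scalar-register steps.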
-define <2 x float> @test_vcvt_f32_s32(<2 x i32> %a) #0 {
-; CHECK: scvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvt.i = sitofp <2 x i32> %a to <2 x float>
- ret <2 x float> %vcvt.i
-}
-
-define <2 x float> @test_vcvt_f32_u32(<2 x i32> %a) #0 {
-; CHECK: ucvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %vcvt.i = uitofp <2 x i32> %a to <2 x float>
- ret <2 x float> %vcvt.i
-}
-
-define <4 x float> @test_vcvtq_f32_s32(<4 x i32> %a) #0 {
-; CHECK: scvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvt.i = sitofp <4 x i32> %a to <4 x float>
- ret <4 x float> %vcvt.i
-}
-
-define <4 x float> @test_vcvtq_f32_u32(<4 x i32> %a) #0 {
-; CHECK: ucvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvt.i = uitofp <4 x i32> %a to <4 x float>
- ret <4 x float> %vcvt.i
-}
-
-define <2 x double> @test_vcvtq_f64_s64(<2 x i64> %a) #0 {
-; CHECK: scvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvt.i = sitofp <2 x i64> %a to <2 x double>
- ret <2 x double> %vcvt.i
-}
-
-define <2 x double> @test_vcvtq_f64_u64(<2 x i64> %a) #0 {
-; CHECK: ucvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvt.i = uitofp <2 x i64> %a to <2 x double>
- ret <2 x double> %vcvt.i
-}
-
-define <2 x float> @test_vcvt_f32_s64(<2 x i64> %a) #0 {
-; CHECK: scvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
-; CHECK: fcvtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
- %vcvt.i = sitofp <2 x i64> %a to <2 x float>
- ret <2 x float> %vcvt.i
-}
-
-define <2 x float> @test_vcvt_f32_u64(<2 x i64> %a) #0 {
-; CHECK: ucvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
-; CHECK: fcvtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
- %vcvt.i = uitofp <2 x i64> %a to <2 x float>
- ret <2 x float> %vcvt.i
-}
-
-define <4 x float> @test_vcvt_f32_s16(<4 x i16> %a) #0 {
-; CHECK: sshll v{{[0-9]+}}.4s, v{{[0-9]+}}.4h, #0
-; CHECK: scvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvt.i = sitofp <4 x i16> %a to <4 x float>
- ret <4 x float> %vcvt.i
-}
-
-define <4 x float> @test_vcvt_f32_u16(<4 x i16> %a) #0 {
-; CHECK: ushll v{{[0-9]+}}.4s, v{{[0-9]+}}.4h, #0
-; CHECK: ucvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %vcvt.i = uitofp <4 x i16> %a to <4 x float>
- ret <4 x float> %vcvt.i
-}
-
-define <2 x double> @test_vcvt_f64_s32(<2 x i32> %a) #0 {
-; CHECK: sshll v{{[0-9]+}}.2d, v{{[0-9]+}}.2s, #0
-; CHECK: scvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvt.i = sitofp <2 x i32> %a to <2 x double>
- ret <2 x double> %vcvt.i
-}
-
-define <2 x double> @test_vcvt_f64_u32(<2 x i32> %a) #0 {
-; CHECK: ushll v{{[0-9]+}}.2d, v{{[0-9]+}}.2s, #0
-; CHECK: ucvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %vcvt.i = uitofp <2 x i32> %a to <2 x double>
- ret <2 x double> %vcvt.i
-}
-
-define <1 x double> @test_vcvt_f64_s8(<1 x i8> %a) #0 {
-; CHECK: umov w{{[0-9]+}}, v{{[0-9]+}}.b[0]
-; CHECK: sxtb w{{[0-9]+}}, w{{[0-9]+}}
-; CHECK: scvtf d{{[0-9]+}}, w{{[0-9]+}}
- %vcvt.i = sitofp <1 x i8> %a to <1 x double>
- ret <1 x double> %vcvt.i
-}
-
-define <1 x double> @test_vcvt_f64_u8(<1 x i8> %a) #0 {
-; CHECK: umov w{{[0-9]+}}, v{{[0-9]+}}.b[0]
-; CHECK: and w{{[0-9]+}}, w{{[0-9]+}}, #0xff
-; CHECK: ucvtf d{{[0-9]+}}, w{{[0-9]+}}
- %vcvt.i = uitofp <1 x i8> %a to <1 x double>
- ret <1 x double> %vcvt.i
-}
-
-define <1 x double> @test_vcvt_f64_s16(<1 x i16> %a) #0 {
-; CHECK: umov w{{[0-9]+}}, v{{[0-9]+}}.h[0]
-; CHECK: sxth w{{[0-9]+}}, w{{[0-9]+}}
-; CHECK: scvtf d{{[0-9]+}}, w{{[0-9]+}}
- %vcvt.i = sitofp <1 x i16> %a to <1 x double>
- ret <1 x double> %vcvt.i
-}
-
-define <1 x double> @test_vcvt_f64_u16(<1 x i16> %a) #0 {
-; CHECK: umov w{{[0-9]+}}, v{{[0-9]+}}.h[0]
-; CHECK: and w{{[0-9]+}}, w{{[0-9]+}}, #0xffff
-; CHECK: ucvtf d{{[0-9]+}}, w{{[0-9]+}}
- %vcvt.i = uitofp <1 x i16> %a to <1 x double>
- ret <1 x double> %vcvt.i
-}
-
-define <1 x double> @test_vcvt_f64_s32_v1(<1 x i32> %a) #0 {
-; CHECK: fmov w{{[0-9]+}}, s{{[0-9]+}}
-; CHECK: scvtf d{{[0-9]+}}, w{{[0-9]+}}
- %vcvt.i = sitofp <1 x i32> %a to <1 x double>
- ret <1 x double> %vcvt.i
-}
-
-define <1 x double> @test_vcvt_f64_u32_v1(<1 x i32> %a) #0 {
-; CHECK: fmov w{{[0-9]+}}, s{{[0-9]+}}
-; CHECK: ucvtf d{{[0-9]+}}, w{{[0-9]+}}
- %vcvt.i = uitofp <1 x i32> %a to <1 x double>
- ret <1 x double> %vcvt.i
-}
-
-declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) #2
-
-declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #2
-
-declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) #2
-
-declare <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32>) #2
-
-declare <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32>) #2
-
-declare <2 x double> @llvm.arm.neon.vrecpe.v2f64(<2 x double>) #2
-
-declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) #2
-
-declare <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float>) #2
-
-declare <2 x double> @llvm.arm.neon.vrsqrte.v2f64(<2 x double>) #2
-
-declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) #2
-
-declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) #2
-
-declare <2 x i64> @llvm.arm.neon.vcvtau.v2i64.v2f64(<2 x double>)
-
-declare <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float>)
-
-declare <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float>)
-
-declare <2 x i64> @llvm.arm.neon.vcvtas.v2i64.v2f64(<2 x double>)
-
-declare <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float>)
-
-declare <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float>)
-
-declare <2 x i64> @llvm.arm.neon.vcvtmu.v2i64.v2f64(<2 x double>)
-
-declare <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float>)
-
-declare <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float>)
-
-declare <2 x i64> @llvm.arm.neon.vcvtms.v2i64.v2f64(<2 x double>)
-
-declare <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float>)
-
-declare <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float>)
-
-declare <2 x i64> @llvm.arm.neon.vcvtpu.v2i64.v2f64(<2 x double>)
-
-declare <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float>)
-
-declare <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float>)
-
-declare <2 x i64> @llvm.arm.neon.vcvtps.v2i64.v2f64(<2 x double>)
-
-declare <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float>)
-
-declare <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float>)
-
-declare <2 x i64> @llvm.arm.neon.vcvtnu.v2i64.v2f64(<2 x double>)
-
-declare <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float>)
-
-declare <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float>)
-
-declare <2 x i64> @llvm.arm.neon.vcvtns.v2i64.v2f64(<2 x double>)
-
-declare <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float>)
-
-declare <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float>)
-
-declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #3
-
-declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #3
-
-declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>) #3
-
-declare <2 x double> @llvm.trunc.v2f64(<2 x double>) #3
-
-declare <4 x float> @llvm.trunc.v4f32(<4 x float>) #3
-
-declare <2 x float> @llvm.trunc.v2f32(<2 x float>) #3
-
-declare <2 x double> @llvm.rint.v2f64(<2 x double>) #3
-
-declare <4 x float> @llvm.rint.v4f32(<4 x float>) #3
-
-declare <2 x float> @llvm.rint.v2f32(<2 x float>) #3
-
-declare <2 x double> @llvm.floor.v2f64(<2 x double>) #3
-
-declare <4 x float> @llvm.floor.v4f32(<4 x float>) #3
-
-declare <2 x float> @llvm.floor.v2f32(<2 x float>) #3
-
-declare <2 x double> @llvm.ceil.v2f64(<2 x double>) #3
-
-declare <4 x float> @llvm.ceil.v4f32(<4 x float>) #3
-
-declare <2 x float> @llvm.ceil.v2f32(<2 x float>) #3
-
-declare <2 x double> @llvm.round.v2f64(<2 x double>) #3
-
-declare <4 x float> @llvm.round.v4f32(<4 x float>) #3
-
-declare <2 x float> @llvm.round.v2f32(<2 x float>) #3
-
-declare <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double>) #2
-
-declare <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float>) #2
-
-declare <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float>) #2
-
-declare <2 x float> @llvm.aarch64.neon.vcvtxn.v2f32.v2f64(<2 x double>) #2
-
-declare <2 x float> @llvm.aarch64.neon.fcvtn.v2f32.v2f64(<2 x double>) #2
-
-declare <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64>) #2
-
-declare <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32>) #2
-
-declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) #2
-
-declare <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64>) #2
-
-declare <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32>) #2
-
-declare <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16>) #2
-
-declare <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64>) #2
-
-declare <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32>) #2
-
-declare <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16>) #2
-
-declare <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8>) #2
-
-declare <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8>) #2
-
-declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) #2
-
-declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>) #2
-
-declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) #2
-
-declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) #2
-
-declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) #2
-
-declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) #2
-
-declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) #2
-
-declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>, i1) #2
-
-declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) #2
-
-declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) #2
-
-declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) #2
-
-declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) #2
-
-declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) #2
-
-declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) #2
-
-declare <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64>, <2 x i64>) #2
-
-declare <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32>, <4 x i32>) #2
-
-declare <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32>, <2 x i32>) #2
-
-declare <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16>, <8 x i16>) #2
-
-declare <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16>, <4 x i16>) #2
-
-declare <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8>, <16 x i8>) #2
-
-declare <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8>, <8 x i8>) #2
-
-declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #3
-
-declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #3
-
-declare <2 x float> @llvm.fabs.v2f32(<2 x float>) #3
-
-declare <2 x i64> @llvm.arm.neon.vabs.v2i64(<2 x i64>) #2
-
-declare <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32>) #2
-
-declare <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32>) #2
-
-declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) #2
-
-declare <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16>) #2
-
-declare <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8>) #2
-
-declare <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8>) #2
-
-declare <2 x i64> @llvm.arm.neon.vqneg.v2i64(<2 x i64>) #2
-
-declare <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32>) #2
-
-declare <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32>) #2
-
-declare <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16>) #2
-
-declare <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16>) #2
-
-declare <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8>) #2
-
-declare <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8>) #2
-
-declare <2 x i64> @llvm.arm.neon.vqabs.v2i64(<2 x i64>) #2
-
-declare <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32>) #2
-
-declare <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32>) #2
-
-declare <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16>) #2
-
-declare <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16>) #2
-
-declare <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8>) #2
-
-declare <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8>) #2
-
-declare <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64>, <4 x i32>) #2
-
-declare <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32>, <8 x i16>) #2
-
-declare <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16>, <16 x i8>) #2
-
-declare <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64>, <4 x i32>) #2
-
-declare <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32>, <8 x i16>) #2
-
-declare <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16>, <16 x i8>) #2
-
-declare <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64>, <2 x i32>) #2
-
-declare <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32>, <4 x i16>) #2
-
-declare <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16>, <8 x i8>) #2
-
-declare <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64>, <2 x i32>) #2
-
-declare <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32>, <4 x i16>) #2
-
-declare <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16>, <8 x i8>) #2
-
-declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) #2
-
-declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) #2
-
-declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) #2
-
-declare <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32>) #2
-
-declare <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16>) #2
-
-declare <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8>) #2
-
-declare <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32>) #2
-
-declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) #2
-
-declare <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8>) #2
-
-declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) #2
-
-declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) #2
-
-declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) #2
-
-declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) #2
-
-declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) #2
-
-
-define <1 x i64> @test_vcvt_s64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvt_s64_f64
-; CHECK: fcvtzs d{{[0-9]+}}, d{{[0-9]+}}
- %1 = fptosi <1 x double> %a to <1 x i64>
- ret <1 x i64> %1
-}
-
-define <1 x i64> @test_vcvt_u64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvt_u64_f64
-; CHECK: fcvtzu d{{[0-9]+}}, d{{[0-9]+}}
- %1 = fptoui <1 x double> %a to <1 x i64>
- ret <1 x i64> %1
-}
-
-define <1 x i64> @test_vcvtn_s64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvtn_s64_f64
-; CHECK: fcvtns d{{[0-9]+}}, d{{[0-9]+}}
- %1 = call <1 x i64> @llvm.arm.neon.vcvtns.v1i64.v1f64(<1 x double> %a)
- ret <1 x i64> %1
-}
-
-define <1 x i64> @test_vcvtn_u64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvtn_u64_f64
-; CHECK: fcvtnu d{{[0-9]+}}, d{{[0-9]+}}
- %1 = call <1 x i64> @llvm.arm.neon.vcvtnu.v1i64.v1f64(<1 x double> %a)
- ret <1 x i64> %1
-}
-
-define <1 x i64> @test_vcvtp_s64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvtp_s64_f64
-; CHECK: fcvtps d{{[0-9]+}}, d{{[0-9]+}}
- %1 = call <1 x i64> @llvm.arm.neon.vcvtps.v1i64.v1f64(<1 x double> %a)
- ret <1 x i64> %1
-}
-
-define <1 x i64> @test_vcvtp_u64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvtp_u64_f64
-; CHECK: fcvtpu d{{[0-9]+}}, d{{[0-9]+}}
- %1 = call <1 x i64> @llvm.arm.neon.vcvtpu.v1i64.v1f64(<1 x double> %a)
- ret <1 x i64> %1
-}
-
-define <1 x i64> @test_vcvtm_s64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvtm_s64_f64
-; CHECK: fcvtms d{{[0-9]+}}, d{{[0-9]+}}
- %1 = call <1 x i64> @llvm.arm.neon.vcvtms.v1i64.v1f64(<1 x double> %a)
- ret <1 x i64> %1
-}
-
-define <1 x i64> @test_vcvtm_u64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvtm_u64_f64
-; CHECK: fcvtmu d{{[0-9]+}}, d{{[0-9]+}}
- %1 = call <1 x i64> @llvm.arm.neon.vcvtmu.v1i64.v1f64(<1 x double> %a)
- ret <1 x i64> %1
-}
-
-define <1 x i64> @test_vcvta_s64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvta_s64_f64
-; CHECK: fcvtas d{{[0-9]+}}, d{{[0-9]+}}
- %1 = call <1 x i64> @llvm.arm.neon.vcvtas.v1i64.v1f64(<1 x double> %a)
- ret <1 x i64> %1
-}
-
-define <1 x i64> @test_vcvta_u64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvta_u64_f64
-; CHECK: fcvtau d{{[0-9]+}}, d{{[0-9]+}}
- %1 = call <1 x i64> @llvm.arm.neon.vcvtau.v1i64.v1f64(<1 x double> %a)
- ret <1 x i64> %1
-}
-
-define <1 x double> @test_vcvt_f64_s64(<1 x i64> %a) {
-; CHECK-LABEL: test_vcvt_f64_s64
-; CHECK: scvtf d{{[0-9]+}}, d{{[0-9]+}}
- %1 = sitofp <1 x i64> %a to <1 x double>
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vcvt_f64_u64(<1 x i64> %a) {
-; CHECK-LABEL: test_vcvt_f64_u64
-; CHECK: ucvtf d{{[0-9]+}}, d{{[0-9]+}}
- %1 = uitofp <1 x i64> %a to <1 x double>
- ret <1 x double> %1
-}
-
-declare <1 x i64> @llvm.arm.neon.vcvtau.v1i64.v1f64(<1 x double>)
-declare <1 x i64> @llvm.arm.neon.vcvtas.v1i64.v1f64(<1 x double>)
-declare <1 x i64> @llvm.arm.neon.vcvtmu.v1i64.v1f64(<1 x double>)
-declare <1 x i64> @llvm.arm.neon.vcvtms.v1i64.v1f64(<1 x double>)
-declare <1 x i64> @llvm.arm.neon.vcvtpu.v1i64.v1f64(<1 x double>)
-declare <1 x i64> @llvm.arm.neon.vcvtps.v1i64.v1f64(<1 x double>)
-declare <1 x i64> @llvm.arm.neon.vcvtnu.v1i64.v1f64(<1 x double>)
-declare <1 x i64> @llvm.arm.neon.vcvtns.v1i64.v1f64(<1 x double>)
-
-define <1 x double> @test_vrndn_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vrndn_f64
-; CHECK: frintn d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.aarch64.neon.frintn.v1f64(<1 x double> %a)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vrnda_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vrnda_f64
-; CHECK: frinta d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.round.v1f64(<1 x double> %a)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vrndp_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vrndp_f64
-; CHECK: frintp d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.ceil.v1f64(<1 x double> %a)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vrndm_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vrndm_f64
-; CHECK: frintm d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.floor.v1f64(<1 x double> %a)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vrndx_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vrndx_f64
-; CHECK: frintx d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.rint.v1f64(<1 x double> %a)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vrnd_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vrnd_f64
-; CHECK: frintz d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.trunc.v1f64(<1 x double> %a)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vrndi_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vrndi_f64
-; CHECK: frinti d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %a)
- ret <1 x double> %1
-}
-
-declare <1 x double> @llvm.nearbyint.v1f64(<1 x double>)
-declare <1 x double> @llvm.trunc.v1f64(<1 x double>)
-declare <1 x double> @llvm.rint.v1f64(<1 x double>)
-declare <1 x double> @llvm.floor.v1f64(<1 x double>)
-declare <1 x double> @llvm.ceil.v1f64(<1 x double>)
-declare <1 x double> @llvm.round.v1f64(<1 x double>)
-declare <1 x double> @llvm.aarch64.neon.frintn.v1f64(<1 x double>)
-
-define <1 x double> @test_vrsqrte_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vrsqrte_f64
-; CHECK: frsqrte d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double> %a)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vrecpe_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vrecpe_f64
-; CHECK: frecpe d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double> %a)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vsqrt_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vsqrt_f64
-; CHECK: fsqrt d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.sqrt.v1f64(<1 x double> %a)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vrecps_f64(<1 x double> %a, <1 x double> %b) {
-; CHECK-LABEL: test_vrecps_f64
-; CHECK: frecps d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double> %a, <1 x double> %b)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vrsqrts_f64(<1 x double> %a, <1 x double> %b) {
-; CHECK-LABEL: test_vrsqrts_f64
-; CHECK: frsqrts d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
- %1 = tail call <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double> %a, <1 x double> %b)
- ret <1 x double> %1
-}
-
-declare <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double>, <1 x double>)
-declare <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double>, <1 x double>)
-declare <1 x double> @llvm.sqrt.v1f64(<1 x double>)
-declare <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double>)
-declare <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double>)
-
-define i64 @test_vaddlv_s32(<2 x i32> %a) {
-; CHECK-LABEL: test_vaddlv_s32
-; CHECK: saddlp {{v[0-9]+}}.1d, {{v[0-9]+}}.2s
- %1 = tail call <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v2i32(<2 x i32> %a)
- %2 = extractelement <1 x i64> %1, i32 0
- ret i64 %2
-}
-
-define i64 @test_vaddlv_u32(<2 x i32> %a) {
-; CHECK-LABEL: test_vaddlv_u32
-; CHECK: uaddlp {{v[0-9]+}}.1d, {{v[0-9]+}}.2s
- %1 = tail call <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v2i32(<2 x i32> %a)
- %2 = extractelement <1 x i64> %1, i32 0
- ret i64 %2
-}
-
-declare <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v2i32(<2 x i32>)
-declare <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v2i32(<2 x i32>)
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-mov.ll b/test/CodeGen/AArch64/neon-mov.ll
index 4035b91..40649ae 100644
--- a/test/CodeGen/AArch64/neon-mov.ll
+++ b/test/CodeGen/AArch64/neon-mov.ll
@@ -1,218 +1,259 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
define <8 x i8> @movi8b() {
-;CHECK: movi {{v[0-9]+}}.8b, #0x8
+; CHECK-LABEL: movi8b:
+; CHECK: movi {{v[0-9]+}}.8b, #{{0x8|8}}
ret <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
}
define <16 x i8> @movi16b() {
-;CHECK: movi {{v[0-9]+}}.16b, #0x8
+; CHECK-LABEL: movi16b:
+; CHECK: movi {{v[0-9]+}}.16b, #{{0x8|8}}
ret <16 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
}
define <2 x i32> @movi2s_lsl0() {
-;CHECK: movi {{v[0-9]+}}.2s, #0xff
+; CHECK-LABEL: movi2s_lsl0:
+; CHECK: movi {{d[0-9]+}}, #0x0000ff000000ff
ret <2 x i32> < i32 255, i32 255 >
}
define <2 x i32> @movi2s_lsl8() {
-;CHECK: movi {{v[0-9]+}}.2s, #0xff, lsl #8
+; CHECK-LABEL: movi2s_lsl8:
+; CHECK: movi {{d[0-9]+}}, #0x00ff000000ff00
ret <2 x i32> < i32 65280, i32 65280 >
}
define <2 x i32> @movi2s_lsl16() {
-;CHECK: movi {{v[0-9]+}}.2s, #0xff, lsl #16
+; CHECK-LABEL: movi2s_lsl16:
+; CHECK: movi {{d[0-9]+}}, #0xff000000ff0000
ret <2 x i32> < i32 16711680, i32 16711680 >
}
define <2 x i32> @movi2s_lsl24() {
-;CHECK: movi {{v[0-9]+}}.2s, #0xff, lsl #24
+; CHECK-LABEL: movi2s_lsl24:
+; CHECK: movi {{d[0-9]+}}, #0xff000000ff000000
ret <2 x i32> < i32 4278190080, i32 4278190080 >
}
define <4 x i32> @movi4s_lsl0() {
-;CHECK: movi {{v[0-9]+}}.4s, #0xff
+; CHECK-LABEL: movi4s_lsl0:
+; CHECK: movi {{v[0-9]+}}.2d, #0x0000ff000000ff
ret <4 x i32> < i32 255, i32 255, i32 255, i32 255 >
}
define <4 x i32> @movi4s_lsl8() {
-;CHECK: movi {{v[0-9]+}}.4s, #0xff, lsl #8
+; CHECK-LABEL: movi4s_lsl8:
+; CHECK: movi {{v[0-9]+}}.2d, #0x00ff000000ff00
ret <4 x i32> < i32 65280, i32 65280, i32 65280, i32 65280 >
}
define <4 x i32> @movi4s_lsl16() {
-;CHECK: movi {{v[0-9]+}}.4s, #0xff, lsl #16
+; CHECK-LABEL: movi4s_lsl16:
+; CHECK: movi {{v[0-9]+}}.2d, #0xff000000ff0000
ret <4 x i32> < i32 16711680, i32 16711680, i32 16711680, i32 16711680 >
}
define <4 x i32> @movi4s_lsl24() {
-;CHECK: movi {{v[0-9]+}}.4s, #0xff, lsl #24
+; CHECK-LABEL: movi4s_lsl24:
+; CHECK: movi {{v[0-9]+}}.2d, #0xff000000ff000000
ret <4 x i32> < i32 4278190080, i32 4278190080, i32 4278190080, i32 4278190080 >
}
define <4 x i16> @movi4h_lsl0() {
-;CHECK: movi {{v[0-9]+}}.4h, #0xff
+; CHECK-LABEL: movi4h_lsl0:
+; CHECK: movi {{d[0-9]+}}, #0xff00ff00ff00ff
ret <4 x i16> < i16 255, i16 255, i16 255, i16 255 >
}
define <4 x i16> @movi4h_lsl8() {
-;CHECK: movi {{v[0-9]+}}.4h, #0xff, lsl #8
+; CHECK-LABEL: movi4h_lsl8:
+; CHECK: movi d0, #0xff00ff00ff00ff00
ret <4 x i16> < i16 65280, i16 65280, i16 65280, i16 65280 >
}
define <8 x i16> @movi8h_lsl0() {
-;CHECK: movi {{v[0-9]+}}.8h, #0xff
+; CHECK-LABEL: movi8h_lsl0:
+; CHECK: movi v0.2d, #0xff00ff00ff00ff
ret <8 x i16> < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255 >
}
define <8 x i16> @movi8h_lsl8() {
-;CHECK: movi {{v[0-9]+}}.8h, #0xff, lsl #8
+; CHECK-LABEL: movi8h_lsl8:
+; CHECK: movi v0.2d, #0xff00ff00ff00ff00
ret <8 x i16> < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 >
}
define <2 x i32> @mvni2s_lsl0() {
-;CHECK: mvni {{v[0-9]+}}.2s, #0x10
+; CHECK-LABEL: mvni2s_lsl0:
+; CHECK: mvni {{v[0-9]+}}.2s, #{{0x10|16}}
ret <2 x i32> < i32 4294967279, i32 4294967279 >
}
define <2 x i32> @mvni2s_lsl8() {
-;CHECK: mvni {{v[0-9]+}}.2s, #0x10, lsl #8
+; CHECK-LABEL: mvni2s_lsl8:
+; CHECK: mvni {{v[0-9]+}}.2s, #{{0x10|16}}, lsl #8
ret <2 x i32> < i32 4294963199, i32 4294963199 >
}
define <2 x i32> @mvni2s_lsl16() {
-;CHECK: mvni {{v[0-9]+}}.2s, #0x10, lsl #16
+; CHECK-LABEL: mvni2s_lsl16:
+; CHECK: mvni {{v[0-9]+}}.2s, #{{0x10|16}}, lsl #16
ret <2 x i32> < i32 4293918719, i32 4293918719 >
}
define <2 x i32> @mvni2s_lsl24() {
-;CHECK: mvni {{v[0-9]+}}.2s, #0x10, lsl #24
+; CHECK-LABEL: mvni2s_lsl24:
+; CHECK: mvni {{v[0-9]+}}.2s, #{{0x10|16}}, lsl #24
ret <2 x i32> < i32 4026531839, i32 4026531839 >
}
define <4 x i32> @mvni4s_lsl0() {
-;CHECK: mvni {{v[0-9]+}}.4s, #0x10
+; CHECK-LABEL: mvni4s_lsl0:
+; CHECK: mvni {{v[0-9]+}}.4s, #{{0x10|16}}
ret <4 x i32> < i32 4294967279, i32 4294967279, i32 4294967279, i32 4294967279 >
}
define <4 x i32> @mvni4s_lsl8() {
-;CHECK: mvni {{v[0-9]+}}.4s, #0x10, lsl #8
+; CHECK-LABEL: mvni4s_lsl8:
+; CHECK: mvni {{v[0-9]+}}.4s, #{{0x10|16}}, lsl #8
ret <4 x i32> < i32 4294963199, i32 4294963199, i32 4294963199, i32 4294963199 >
}
define <4 x i32> @mvni4s_lsl16() {
-;CHECK: mvni {{v[0-9]+}}.4s, #0x10, lsl #16
+; CHECK-LABEL: mvni4s_lsl16:
+; CHECK: mvni {{v[0-9]+}}.4s, #{{0x10|16}}, lsl #16
ret <4 x i32> < i32 4293918719, i32 4293918719, i32 4293918719, i32 4293918719 >
}
define <4 x i32> @mvni4s_lsl24() {
-;CHECK: mvni {{v[0-9]+}}.4s, #0x10, lsl #24
+; CHECK-LABEL: mvni4s_lsl24:
+; CHECK: mvni {{v[0-9]+}}.4s, #{{0x10|16}}, lsl #24
ret <4 x i32> < i32 4026531839, i32 4026531839, i32 4026531839, i32 4026531839 >
}
define <4 x i16> @mvni4h_lsl0() {
-;CHECK: mvni {{v[0-9]+}}.4h, #0x10
+; CHECK-LABEL: mvni4h_lsl0:
+; CHECK: mvni {{v[0-9]+}}.4h, #{{0x10|16}}
ret <4 x i16> < i16 65519, i16 65519, i16 65519, i16 65519 >
}
define <4 x i16> @mvni4h_lsl8() {
-;CHECK: mvni {{v[0-9]+}}.4h, #0x10, lsl #8
+; CHECK-LABEL: mvni4h_lsl8:
+; CHECK: mvni {{v[0-9]+}}.4h, #{{0x10|16}}, lsl #8
ret <4 x i16> < i16 61439, i16 61439, i16 61439, i16 61439 >
}
define <8 x i16> @mvni8h_lsl0() {
-;CHECK: mvni {{v[0-9]+}}.8h, #0x10
+; CHECK-LABEL: mvni8h_lsl0:
+; CHECK: mvni {{v[0-9]+}}.8h, #{{0x10|16}}
ret <8 x i16> < i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519 >
}
define <8 x i16> @mvni8h_lsl8() {
-;CHECK: mvni {{v[0-9]+}}.8h, #0x10, lsl #8
+; CHECK-LABEL: mvni8h_lsl8:
+; CHECK: mvni {{v[0-9]+}}.8h, #{{0x10|16}}, lsl #8
ret <8 x i16> < i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439 >
}
define <2 x i32> @movi2s_msl8(<2 x i32> %a) {
-;CHECK: movi {{v[0-9]+}}.2s, #0xff, msl #8
+; CHECK-LABEL: movi2s_msl8:
+; CHECK: movi {{d[0-9]+}}, #0x00ffff0000ffff
ret <2 x i32> < i32 65535, i32 65535 >
}
define <2 x i32> @movi2s_msl16() {
-;CHECK: movi {{v[0-9]+}}.2s, #0xff, msl #16
+; CHECK-LABEL: movi2s_msl16:
+; CHECK: movi d0, #0xffffff00ffffff
ret <2 x i32> < i32 16777215, i32 16777215 >
}
define <4 x i32> @movi4s_msl8() {
-;CHECK: movi {{v[0-9]+}}.4s, #0xff, msl #8
+; CHECK-LABEL: movi4s_msl8:
+; CHECK: movi v0.2d, #0x00ffff0000ffff
ret <4 x i32> < i32 65535, i32 65535, i32 65535, i32 65535 >
}
define <4 x i32> @movi4s_msl16() {
-;CHECK: movi {{v[0-9]+}}.4s, #0xff, msl #16
+; CHECK-LABEL: movi4s_msl16:
+; CHECK: movi v0.2d, #0xffffff00ffffff
ret <4 x i32> < i32 16777215, i32 16777215, i32 16777215, i32 16777215 >
}
define <2 x i32> @mvni2s_msl8() {
-;CHECK: mvni {{v[0-9]+}}.2s, #0x10, msl #8
+; CHECK-LABEL: mvni2s_msl8:
+; CHECK: mvni {{v[0-9]+}}.2s, #{{0x10|16}}, msl #8
ret <2 x i32> < i32 18446744073709547264, i32 18446744073709547264>
}
define <2 x i32> @mvni2s_msl16() {
-;CHECK: mvni {{v[0-9]+}}.2s, #0x10, msl #16
+; CHECK-LABEL: mvni2s_msl16:
+; CHECK: mvni {{v[0-9]+}}.2s, #{{0x10|16}}, msl #16
ret <2 x i32> < i32 18446744073708437504, i32 18446744073708437504>
}
define <4 x i32> @mvni4s_msl8() {
-;CHECK: mvni {{v[0-9]+}}.4s, #0x10, msl #8
+; CHECK-LABEL: mvni4s_msl8:
+; CHECK: mvni {{v[0-9]+}}.4s, #{{0x10|16}}, msl #8
ret <4 x i32> < i32 18446744073709547264, i32 18446744073709547264, i32 18446744073709547264, i32 18446744073709547264>
}
define <4 x i32> @mvni4s_msl16() {
-;CHECK: mvni {{v[0-9]+}}.4s, #0x10, msl #16
+; CHECK-LABEL: mvni4s_msl16:
+; CHECK: mvni {{v[0-9]+}}.4s, #{{0x10|16}}, msl #16
ret <4 x i32> < i32 18446744073708437504, i32 18446744073708437504, i32 18446744073708437504, i32 18446744073708437504>
}
define <2 x i64> @movi2d() {
-;CHECK: movi {{v[0-9]+}}.2d, #0xff0000ff0000ffff
+; CHECK-LABEL: movi2d:
+; CHECK: movi {{v[0-9]+}}.2d, #0xff0000ff0000ffff
ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 >
}
define <1 x i64> @movid() {
-;CHECK: movi {{d[0-9]+}}, #0xff0000ff0000ffff
+; CHECK-LABEL: movid:
+; CHECK: movi {{d[0-9]+}}, #0xff0000ff0000ffff
ret <1 x i64> < i64 18374687574888349695 >
}
define <2 x float> @fmov2s() {
-;CHECK: fmov {{v[0-9]+}}.2s, #-12.00000000
+; CHECK-LABEL: fmov2s:
+; CHECK: fmov {{v[0-9]+}}.2s, #{{-12.00000000|-1.200000e\+01}}
ret <2 x float> < float -1.2e1, float -1.2e1>
}
define <4 x float> @fmov4s() {
-;CHECK: fmov {{v[0-9]+}}.4s, #-12.00000000
+; CHECK-LABEL: fmov4s:
+; CHECK: fmov {{v[0-9]+}}.4s, #{{-12.00000000|-1.200000e\+01}}
ret <4 x float> < float -1.2e1, float -1.2e1, float -1.2e1, float -1.2e1>
}
define <2 x double> @fmov2d() {
-;CHECK: fmov {{v[0-9]+}}.2d, #-12.00000000
+; CHECK-LABEL: fmov2d:
+; CHECK: fmov {{v[0-9]+}}.2d, #{{-12.00000000|-1.200000e\+01}}
ret <2 x double> < double -1.2e1, double -1.2e1>
}
define <2 x i32> @movi1d_1() {
-; CHECK: movi d0, #0xffffffff0000
+; CHECK-LABEL: movi1d_1:
+; CHECK: movi d0, #0x{{0*}}ffffffff0000
ret <2 x i32> < i32 -65536, i32 65535>
}
declare <2 x i32> @test_movi1d(<2 x i32>, <2 x i32>)
define <2 x i32> @movi1d() {
+; CHECK-LABEL: movi1d:
; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
-; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
-; CHECK-NEXT: movi d1, #0xffffffff0000
+; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{#?}}:lo12:.{{[A-Z0-9_]+}}]
+; CHECK-NEXT: movi d1, #0x{{0*}}ffffffff0000
%1 = tail call <2 x i32> @test_movi1d(<2 x i32> <i32 -2147483648, i32 2147450880>, <2 x i32> <i32 -65536, i32 65535>)
ret <2 x i32> %1
}
diff --git a/test/CodeGen/AArch64/neon-mul-div.ll b/test/CodeGen/AArch64/neon-mul-div.ll
deleted file mode 100644
index da22ce8..0000000
--- a/test/CodeGen/AArch64/neon-mul-div.ll
+++ /dev/null
@@ -1,754 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-
-define <8 x i8> @mul8xi8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
- %tmp3 = mul <8 x i8> %A, %B;
- ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @mul16xi8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: mul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
- %tmp3 = mul <16 x i8> %A, %B;
- ret <16 x i8> %tmp3
-}
-
-define <4 x i16> @mul4xi16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
- %tmp3 = mul <4 x i16> %A, %B;
- ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @mul8xi16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
- %tmp3 = mul <8 x i16> %A, %B;
- ret <8 x i16> %tmp3
-}
-
-define <2 x i32> @mul2xi32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %tmp3 = mul <2 x i32> %A, %B;
- ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @mul4x32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
- %tmp3 = mul <4 x i32> %A, %B;
- ret <4 x i32> %tmp3
-}
-
-define <1 x i64> @mul1xi64(<1 x i64> %A, <1 x i64> %B) {
-;CHECK-LABEL: mul1xi64:
-;CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}}
- %tmp3 = mul <1 x i64> %A, %B;
- ret <1 x i64> %tmp3
-}
-
-define <2 x i64> @mul2xi64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK-LABEL: mul2xi64:
-;CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}}
-;CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}}
- %tmp3 = mul <2 x i64> %A, %B;
- ret <2 x i64> %tmp3
-}
-
- define <2 x float> @mul2xfloat(<2 x float> %A, <2 x float> %B) {
-;CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %tmp3 = fmul <2 x float> %A, %B;
- ret <2 x float> %tmp3
-}
-
-define <4 x float> @mul4xfloat(<4 x float> %A, <4 x float> %B) {
-;CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
- %tmp3 = fmul <4 x float> %A, %B;
- ret <4 x float> %tmp3
-}
-define <2 x double> @mul2xdouble(<2 x double> %A, <2 x double> %B) {
-;CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
- %tmp3 = fmul <2 x double> %A, %B;
- ret <2 x double> %tmp3
-}
-
-
- define <2 x float> @div2xfloat(<2 x float> %A, <2 x float> %B) {
-;CHECK: fdiv {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %tmp3 = fdiv <2 x float> %A, %B;
- ret <2 x float> %tmp3
-}
-
-define <4 x float> @div4xfloat(<4 x float> %A, <4 x float> %B) {
-;CHECK: fdiv {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
- %tmp3 = fdiv <4 x float> %A, %B;
- ret <4 x float> %tmp3
-}
-define <2 x double> @div2xdouble(<2 x double> %A, <2 x double> %B) {
-;CHECK: fdiv {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
- %tmp3 = fdiv <2 x double> %A, %B;
- ret <2 x double> %tmp3
-}
-
-define <1 x i8> @sdiv1x8(<1 x i8> %A, <1 x i8> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = sdiv <1 x i8> %A, %B;
- ret <1 x i8> %tmp3
-}
-
-define <8 x i8> @sdiv8x8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = sdiv <8 x i8> %A, %B;
- ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @sdiv16x8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = sdiv <16 x i8> %A, %B;
- ret <16 x i8> %tmp3
-}
-
-define <1 x i16> @sdiv1x16(<1 x i16> %A, <1 x i16> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = sdiv <1 x i16> %A, %B;
- ret <1 x i16> %tmp3
-}
-
-define <4 x i16> @sdiv4x16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = sdiv <4 x i16> %A, %B;
- ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @sdiv8x16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = sdiv <8 x i16> %A, %B;
- ret <8 x i16> %tmp3
-}
-
-define <1 x i32> @sdiv1x32(<1 x i32> %A, <1 x i32> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = sdiv <1 x i32> %A, %B;
- ret <1 x i32> %tmp3
-}
-
-define <2 x i32> @sdiv2x32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = sdiv <2 x i32> %A, %B;
- ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @sdiv4x32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = sdiv <4 x i32> %A, %B;
- ret <4 x i32> %tmp3
-}
-
-define <1 x i64> @sdiv1x64(<1 x i64> %A, <1 x i64> %B) {
-;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
- %tmp3 = sdiv <1 x i64> %A, %B;
- ret <1 x i64> %tmp3
-}
-
-define <2 x i64> @sdiv2x64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
- %tmp3 = sdiv <2 x i64> %A, %B;
- ret <2 x i64> %tmp3
-}
-
-define <1 x i8> @udiv1x8(<1 x i8> %A, <1 x i8> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = udiv <1 x i8> %A, %B;
- ret <1 x i8> %tmp3
-}
-
-define <8 x i8> @udiv8x8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = udiv <8 x i8> %A, %B;
- ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @udiv16x8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = udiv <16 x i8> %A, %B;
- ret <16 x i8> %tmp3
-}
-
-define <1 x i16> @udiv1x16(<1 x i16> %A, <1 x i16> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = udiv <1 x i16> %A, %B;
- ret <1 x i16> %tmp3
-}
-
-define <4 x i16> @udiv4x16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = udiv <4 x i16> %A, %B;
- ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @udiv8x16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = udiv <8 x i16> %A, %B;
- ret <8 x i16> %tmp3
-}
-
-define <1 x i32> @udiv1x32(<1 x i32> %A, <1 x i32> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = udiv <1 x i32> %A, %B;
- ret <1 x i32> %tmp3
-}
-
-define <2 x i32> @udiv2x32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = udiv <2 x i32> %A, %B;
- ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @udiv4x32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = udiv <4 x i32> %A, %B;
- ret <4 x i32> %tmp3
-}
-
-define <1 x i64> @udiv1x64(<1 x i64> %A, <1 x i64> %B) {
-;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
- %tmp3 = udiv <1 x i64> %A, %B;
- ret <1 x i64> %tmp3
-}
-
-define <2 x i64> @udiv2x64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
- %tmp3 = udiv <2 x i64> %A, %B;
- ret <2 x i64> %tmp3
-}
-
-define <1 x i8> @srem1x8(<1 x i8> %A, <1 x i8> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = srem <1 x i8> %A, %B;
- ret <1 x i8> %tmp3
-}
-
-define <8 x i8> @srem8x8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = srem <8 x i8> %A, %B;
- ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @srem16x8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = srem <16 x i8> %A, %B;
- ret <16 x i8> %tmp3
-}
-
-define <1 x i16> @srem1x16(<1 x i16> %A, <1 x i16> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = srem <1 x i16> %A, %B;
- ret <1 x i16> %tmp3
-}
-
-define <4 x i16> @srem4x16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = srem <4 x i16> %A, %B;
- ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @srem8x16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = srem <8 x i16> %A, %B;
- ret <8 x i16> %tmp3
-}
-
-define <1 x i32> @srem1x32(<1 x i32> %A, <1 x i32> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = srem <1 x i32> %A, %B;
- ret <1 x i32> %tmp3
-}
-
-define <2 x i32> @srem2x32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = srem <2 x i32> %A, %B;
- ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @srem4x32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = srem <4 x i32> %A, %B;
- ret <4 x i32> %tmp3
-}
-
-define <1 x i64> @srem1x64(<1 x i64> %A, <1 x i64> %B) {
-;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
- %tmp3 = srem <1 x i64> %A, %B;
- ret <1 x i64> %tmp3
-}
-
-define <2 x i64> @srem2x64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-;CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
- %tmp3 = srem <2 x i64> %A, %B;
- ret <2 x i64> %tmp3
-}
-
-define <1 x i8> @urem1x8(<1 x i8> %A, <1 x i8> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = urem <1 x i8> %A, %B;
- ret <1 x i8> %tmp3
-}
-
-define <8 x i8> @urem8x8(<8 x i8> %A, <8 x i8> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = urem <8 x i8> %A, %B;
- ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @urem16x8(<16 x i8> %A, <16 x i8> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = urem <16 x i8> %A, %B;
- ret <16 x i8> %tmp3
-}
-
-define <1 x i16> @urem1x16(<1 x i16> %A, <1 x i16> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = urem <1 x i16> %A, %B;
- ret <1 x i16> %tmp3
-}
-
-define <4 x i16> @urem4x16(<4 x i16> %A, <4 x i16> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = urem <4 x i16> %A, %B;
- ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @urem8x16(<8 x i16> %A, <8 x i16> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = urem <8 x i16> %A, %B;
- ret <8 x i16> %tmp3
-}
-
-define <1 x i32> @urem1x32(<1 x i32> %A, <1 x i32> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = urem <1 x i32> %A, %B;
- ret <1 x i32> %tmp3
-}
-
-define <2 x i32> @urem2x32(<2 x i32> %A, <2 x i32> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = urem <2 x i32> %A, %B;
- ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @urem4x32(<4 x i32> %A, <4 x i32> %B) {
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
-;CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
- %tmp3 = urem <4 x i32> %A, %B;
- ret <4 x i32> %tmp3
-}
-
-define <1 x i64> @urem1x64(<1 x i64> %A, <1 x i64> %B) {
-;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
- %tmp3 = urem <1 x i64> %A, %B;
- ret <1 x i64> %tmp3
-}
-
-define <2 x i64> @urem2x64(<2 x i64> %A, <2 x i64> %B) {
-;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-;CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
-;CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
- %tmp3 = urem <2 x i64> %A, %B;
- ret <2 x i64> %tmp3
-}
-
-define <2 x float> @frem2f32(<2 x float> %A, <2 x float> %B) {
-; CHECK: bl fmodf
-; CHECK: bl fmodf
- %tmp3 = frem <2 x float> %A, %B;
- ret <2 x float> %tmp3
-}
-
-define <4 x float> @frem4f32(<4 x float> %A, <4 x float> %B) {
-; CHECK: bl fmodf
-; CHECK: bl fmodf
-; CHECK: bl fmodf
-; CHECK: bl fmodf
- %tmp3 = frem <4 x float> %A, %B;
- ret <4 x float> %tmp3
-}
-
-define <1 x double> @frem1d64(<1 x double> %A, <1 x double> %B) {
-; CHECK: bl fmod
- %tmp3 = frem <1 x double> %A, %B;
- ret <1 x double> %tmp3
-}
-
-define <2 x double> @frem2d64(<2 x double> %A, <2 x double> %B) {
-; CHECK: bl fmod
-; CHECK: bl fmod
- %tmp3 = frem <2 x double> %A, %B;
- ret <2 x double> %tmp3
-}
-
-declare <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8>, <8 x i8>)
-declare <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8>, <16 x i8>)
-
-define <8 x i8> @poly_mulv8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: poly_mulv8i8:
- %prod = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: pmul v0.8b, v0.8b, v1.8b
- ret <8 x i8> %prod
-}
-
-define <16 x i8> @poly_mulv16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: poly_mulv16i8:
- %prod = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: pmul v0.16b, v0.16b, v1.16b
- ret <16 x i8> %prod
-}
-
-declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>)
-declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>)
-declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i16> @test_sqdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_sqdmulh_v4i16:
- %prod = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: sqdmulh v0.4h, v0.4h, v1.4h
- ret <4 x i16> %prod
-}
-
-define <8 x i16> @test_sqdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_sqdmulh_v8i16:
- %prod = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: sqdmulh v0.8h, v0.8h, v1.8h
- ret <8 x i16> %prod
-}
-
-define <2 x i32> @test_sqdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_sqdmulh_v2i32:
- %prod = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: sqdmulh v0.2s, v0.2s, v1.2s
- ret <2 x i32> %prod
-}
-
-define <4 x i32> @test_sqdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_sqdmulh_v4i32:
- %prod = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: sqdmulh v0.4s, v0.4s, v1.4s
- ret <4 x i32> %prod
-}
-
-declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>)
-declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>)
-declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i16> @test_sqrdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_sqrdmulh_v4i16:
- %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: sqrdmulh v0.4h, v0.4h, v1.4h
- ret <4 x i16> %prod
-}
-
-define <8 x i16> @test_sqrdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_sqrdmulh_v8i16:
- %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: sqrdmulh v0.8h, v0.8h, v1.8h
- ret <8 x i16> %prod
-}
-
-define <2 x i32> @test_sqrdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_sqrdmulh_v2i32:
- %prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: sqrdmulh v0.2s, v0.2s, v1.2s
- ret <2 x i32> %prod
-}
-
-define <4 x i32> @test_sqrdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_sqrdmulh_v4i32:
- %prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: sqrdmulh v0.4s, v0.4s, v1.4s
- ret <4 x i32> %prod
-}
-
-declare <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @fmulx_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; Using registers other than v0, v1 and v2 are possible, but would be odd.
-; CHECK: fmulx v0.2s, v0.2s, v1.2s
- %val = call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %lhs, <2 x float> %rhs)
- ret <2 x float> %val
-}
-
-define <4 x float> @fmulx_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; Using registers other than v0, v1 and v2 are possible, but would be odd.
-; CHECK: fmulx v0.4s, v0.4s, v1.4s
- %val = call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %lhs, <4 x float> %rhs)
- ret <4 x float> %val
-}
-
-define <2 x double> @fmulx_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; Using registers other than v0, v1 and v2 are possible, but would be odd.
-; CHECK: fmulx v0.2d, v0.2d, v1.2d
- %val = call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %lhs, <2 x double> %rhs)
- ret <2 x double> %val
-}
-
-define <1 x i8> @test_mul_v1i8(<1 x i8> %a, <1 x i8> %b) {
-;CHECK-LABEL: test_mul_v1i8:
-;CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
- %c = mul <1 x i8> %a, %b
- ret <1 x i8> %c
-}
-
-define <1 x i16> @test_mul_v1i16(<1 x i16> %a, <1 x i16> %b) {
-;CHECK-LABEL: test_mul_v1i16:
-;CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
- %c = mul <1 x i16> %a, %b
- ret <1 x i16> %c
-}
-
-define <1 x i32> @test_mul_v1i32(<1 x i32> %a, <1 x i32> %b) {
-;CHECK-LABEL: test_mul_v1i32:
-;CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
- %c = mul <1 x i32> %a, %b
- ret <1 x i32> %c
-}
diff --git a/test/CodeGen/AArch64/neon-perm.ll b/test/CodeGen/AArch64/neon-perm.ll
index a0b17e1..4f8571d 100644
--- a/test/CodeGen/AArch64/neon-perm.ll
+++ b/test/CodeGen/AArch64/neon-perm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
%struct.int8x8x2_t = type { [2 x <8 x i8>] }
%struct.int16x4x2_t = type { [2 x <4 x i16>] }
@@ -20,7 +20,7 @@
%struct.poly16x8x2_t = type { [2 x <8 x i16>] }
define <8 x i8> @test_vuzp1_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vuzp1_s8:
+; CHECK-LABEL: test_vuzp1_s8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -28,7 +28,7 @@ entry:
}
define <16 x i8> @test_vuzp1q_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vuzp1q_s8:
+; CHECK-LABEL: test_vuzp1q_s8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -36,7 +36,7 @@ entry:
}
define <4 x i16> @test_vuzp1_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vuzp1_s16:
+; CHECK-LABEL: test_vuzp1_s16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -44,7 +44,7 @@ entry:
}
define <8 x i16> @test_vuzp1q_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vuzp1q_s16:
+; CHECK-LABEL: test_vuzp1q_s16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -52,15 +52,15 @@ entry:
}
define <2 x i32> @test_vuzp1_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vuzp1_s32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vuzp1_s32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vuzp1q_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vuzp1q_s32:
+; CHECK-LABEL: test_vuzp1q_s32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -68,15 +68,15 @@ entry:
}
define <2 x i64> @test_vuzp1q_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vuzp1q_s64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vuzp1q_s64:
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i64> %shuffle.i
}
define <8 x i8> @test_vuzp1_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vuzp1_u8:
+; CHECK-LABEL: test_vuzp1_u8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -84,7 +84,7 @@ entry:
}
define <16 x i8> @test_vuzp1q_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vuzp1q_u8:
+; CHECK-LABEL: test_vuzp1q_u8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -92,7 +92,7 @@ entry:
}
define <4 x i16> @test_vuzp1_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vuzp1_u16:
+; CHECK-LABEL: test_vuzp1_u16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -100,7 +100,7 @@ entry:
}
define <8 x i16> @test_vuzp1q_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vuzp1q_u16:
+; CHECK-LABEL: test_vuzp1q_u16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -108,15 +108,15 @@ entry:
}
define <2 x i32> @test_vuzp1_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vuzp1_u32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vuzp1_u32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vuzp1q_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vuzp1q_u32:
+; CHECK-LABEL: test_vuzp1q_u32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -124,23 +124,23 @@ entry:
}
define <2 x i64> @test_vuzp1q_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vuzp1q_u64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vuzp1q_u64:
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i64> %shuffle.i
}
define <2 x float> @test_vuzp1_f32(<2 x float> %a, <2 x float> %b) {
-; CHECK: test_vuzp1_f32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vuzp1_f32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
ret <2 x float> %shuffle.i
}
define <4 x float> @test_vuzp1q_f32(<4 x float> %a, <4 x float> %b) {
-; CHECK: test_vuzp1q_f32:
+; CHECK-LABEL: test_vuzp1q_f32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -148,15 +148,15 @@ entry:
}
define <2 x double> @test_vuzp1q_f64(<2 x double> %a, <2 x double> %b) {
-; CHECK: test_vuzp1q_f64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vuzp1q_f64:
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
ret <2 x double> %shuffle.i
}
define <8 x i8> @test_vuzp1_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vuzp1_p8:
+; CHECK-LABEL: test_vuzp1_p8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -164,7 +164,7 @@ entry:
}
define <16 x i8> @test_vuzp1q_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vuzp1q_p8:
+; CHECK-LABEL: test_vuzp1q_p8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -172,7 +172,7 @@ entry:
}
define <4 x i16> @test_vuzp1_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vuzp1_p16:
+; CHECK-LABEL: test_vuzp1_p16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -180,7 +180,7 @@ entry:
}
define <8 x i16> @test_vuzp1q_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vuzp1q_p16:
+; CHECK-LABEL: test_vuzp1q_p16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -188,7 +188,7 @@ entry:
}
define <8 x i8> @test_vuzp2_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vuzp2_s8:
+; CHECK-LABEL: test_vuzp2_s8:
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -196,7 +196,7 @@ entry:
}
define <16 x i8> @test_vuzp2q_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vuzp2q_s8:
+; CHECK-LABEL: test_vuzp2q_s8:
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
@@ -204,7 +204,7 @@ entry:
}
define <4 x i16> @test_vuzp2_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vuzp2_s16:
+; CHECK-LABEL: test_vuzp2_s16:
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -212,7 +212,7 @@ entry:
}
define <8 x i16> @test_vuzp2q_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vuzp2q_s16:
+; CHECK-LABEL: test_vuzp2q_s16:
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -220,15 +220,15 @@ entry:
}
define <2 x i32> @test_vuzp2_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vuzp2_s32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vuzp2_s32:
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vuzp2q_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vuzp2q_s32:
+; CHECK-LABEL: test_vuzp2q_s32:
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -236,16 +236,15 @@ entry:
}
define <2 x i64> @test_vuzp2q_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vuzp2q_s64:
-; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
-; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: test_vuzp2q_s64:
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i64> %shuffle.i
}
define <8 x i8> @test_vuzp2_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vuzp2_u8:
+; CHECK-LABEL: test_vuzp2_u8:
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -253,7 +252,7 @@ entry:
}
define <16 x i8> @test_vuzp2q_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vuzp2q_u8:
+; CHECK-LABEL: test_vuzp2q_u8:
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
@@ -261,7 +260,7 @@ entry:
}
define <4 x i16> @test_vuzp2_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vuzp2_u16:
+; CHECK-LABEL: test_vuzp2_u16:
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -269,7 +268,7 @@ entry:
}
define <8 x i16> @test_vuzp2q_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vuzp2q_u16:
+; CHECK-LABEL: test_vuzp2q_u16:
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -277,15 +276,15 @@ entry:
}
define <2 x i32> @test_vuzp2_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vuzp2_u32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vuzp2_u32:
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vuzp2q_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vuzp2q_u32:
+; CHECK-LABEL: test_vuzp2q_u32:
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -293,24 +292,23 @@ entry:
}
define <2 x i64> @test_vuzp2q_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vuzp2q_u64:
-; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
-; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: test_vuzp2q_u64:
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i64> %shuffle.i
}
define <2 x float> @test_vuzp2_f32(<2 x float> %a, <2 x float> %b) {
-; CHECK: test_vuzp2_f32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vuzp2_f32:
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
ret <2 x float> %shuffle.i
}
define <4 x float> @test_vuzp2q_f32(<4 x float> %a, <4 x float> %b) {
-; CHECK: test_vuzp2q_f32:
+; CHECK-LABEL: test_vuzp2q_f32:
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -318,16 +316,15 @@ entry:
}
define <2 x double> @test_vuzp2q_f64(<2 x double> %a, <2 x double> %b) {
-; CHECK: test_vuzp2q_f64:
-; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
-; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-LABEL: test_vuzp2q_f64:
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
ret <2 x double> %shuffle.i
}
define <8 x i8> @test_vuzp2_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vuzp2_p8:
+; CHECK-LABEL: test_vuzp2_p8:
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -335,7 +332,7 @@ entry:
}
define <16 x i8> @test_vuzp2q_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vuzp2q_p8:
+; CHECK-LABEL: test_vuzp2q_p8:
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
@@ -343,7 +340,7 @@ entry:
}
define <4 x i16> @test_vuzp2_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vuzp2_p16:
+; CHECK-LABEL: test_vuzp2_p16:
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -351,7 +348,7 @@ entry:
}
define <8 x i16> @test_vuzp2q_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vuzp2q_p16:
+; CHECK-LABEL: test_vuzp2q_p16:
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -359,7 +356,7 @@ entry:
}
define <8 x i8> @test_vzip1_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vzip1_s8:
+; CHECK-LABEL: test_vzip1_s8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -367,7 +364,7 @@ entry:
}
define <16 x i8> @test_vzip1q_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vzip1q_s8:
+; CHECK-LABEL: test_vzip1q_s8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -375,7 +372,7 @@ entry:
}
define <4 x i16> @test_vzip1_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vzip1_s16:
+; CHECK-LABEL: test_vzip1_s16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -383,7 +380,7 @@ entry:
}
define <8 x i16> @test_vzip1q_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vzip1q_s16:
+; CHECK-LABEL: test_vzip1q_s16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -391,15 +388,15 @@ entry:
}
define <2 x i32> @test_vzip1_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vzip1_s32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vzip1_s32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vzip1q_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vzip1q_s32:
+; CHECK-LABEL: test_vzip1q_s32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -407,15 +404,15 @@ entry:
}
define <2 x i64> @test_vzip1q_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vzip1q_s64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vzip1q_s64:
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i64> %shuffle.i
}
define <8 x i8> @test_vzip1_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vzip1_u8:
+; CHECK-LABEL: test_vzip1_u8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -423,7 +420,7 @@ entry:
}
define <16 x i8> @test_vzip1q_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vzip1q_u8:
+; CHECK-LABEL: test_vzip1q_u8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -431,7 +428,7 @@ entry:
}
define <4 x i16> @test_vzip1_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vzip1_u16:
+; CHECK-LABEL: test_vzip1_u16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -439,7 +436,7 @@ entry:
}
define <8 x i16> @test_vzip1q_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vzip1q_u16:
+; CHECK-LABEL: test_vzip1q_u16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -447,15 +444,15 @@ entry:
}
define <2 x i32> @test_vzip1_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vzip1_u32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vzip1_u32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vzip1q_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vzip1q_u32:
+; CHECK-LABEL: test_vzip1q_u32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -463,23 +460,23 @@ entry:
}
define <2 x i64> @test_vzip1q_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vzip1q_u64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vzip1q_u64:
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i64> %shuffle.i
}
define <2 x float> @test_vzip1_f32(<2 x float> %a, <2 x float> %b) {
-; CHECK: test_vzip1_f32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vzip1_f32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
ret <2 x float> %shuffle.i
}
define <4 x float> @test_vzip1q_f32(<4 x float> %a, <4 x float> %b) {
-; CHECK: test_vzip1q_f32:
+; CHECK-LABEL: test_vzip1q_f32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -487,15 +484,15 @@ entry:
}
define <2 x double> @test_vzip1q_f64(<2 x double> %a, <2 x double> %b) {
-; CHECK: test_vzip1q_f64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vzip1q_f64:
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
ret <2 x double> %shuffle.i
}
define <8 x i8> @test_vzip1_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vzip1_p8:
+; CHECK-LABEL: test_vzip1_p8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -503,7 +500,7 @@ entry:
}
define <16 x i8> @test_vzip1q_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vzip1q_p8:
+; CHECK-LABEL: test_vzip1q_p8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -511,7 +508,7 @@ entry:
}
define <4 x i16> @test_vzip1_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vzip1_p16:
+; CHECK-LABEL: test_vzip1_p16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -519,7 +516,7 @@ entry:
}
define <8 x i16> @test_vzip1q_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vzip1q_p16:
+; CHECK-LABEL: test_vzip1q_p16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -527,7 +524,7 @@ entry:
}
define <8 x i8> @test_vzip2_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vzip2_s8:
+; CHECK-LABEL: test_vzip2_s8:
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -535,7 +532,7 @@ entry:
}
define <16 x i8> @test_vzip2q_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vzip2q_s8:
+; CHECK-LABEL: test_vzip2q_s8:
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -543,7 +540,7 @@ entry:
}
define <4 x i16> @test_vzip2_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vzip2_s16:
+; CHECK-LABEL: test_vzip2_s16:
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -551,7 +548,7 @@ entry:
}
define <8 x i16> @test_vzip2q_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vzip2q_s16:
+; CHECK-LABEL: test_vzip2q_s16:
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -559,15 +556,15 @@ entry:
}
define <2 x i32> @test_vzip2_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vzip2_s32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vzip2_s32:
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vzip2q_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vzip2q_s32:
+; CHECK-LABEL: test_vzip2q_s32:
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -575,15 +572,15 @@ entry:
}
define <2 x i64> @test_vzip2q_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vzip2q_s64:
-; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_vzip2q_s64:
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i64> %shuffle.i
}
define <8 x i8> @test_vzip2_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vzip2_u8:
+; CHECK-LABEL: test_vzip2_u8:
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -591,7 +588,7 @@ entry:
}
define <16 x i8> @test_vzip2q_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vzip2q_u8:
+; CHECK-LABEL: test_vzip2q_u8:
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -599,7 +596,7 @@ entry:
}
define <4 x i16> @test_vzip2_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vzip2_u16:
+; CHECK-LABEL: test_vzip2_u16:
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -607,7 +604,7 @@ entry:
}
define <8 x i16> @test_vzip2q_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vzip2q_u16:
+; CHECK-LABEL: test_vzip2q_u16:
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -615,15 +612,15 @@ entry:
}
define <2 x i32> @test_vzip2_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vzip2_u32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vzip2_u32:
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vzip2q_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vzip2q_u32:
+; CHECK-LABEL: test_vzip2q_u32:
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -631,23 +628,23 @@ entry:
}
define <2 x i64> @test_vzip2q_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vzip2q_u64:
-; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_vzip2q_u64:
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i64> %shuffle.i
}
define <2 x float> @test_vzip2_f32(<2 x float> %a, <2 x float> %b) {
-; CHECK: test_vzip2_f32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vzip2_f32:
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
ret <2 x float> %shuffle.i
}
define <4 x float> @test_vzip2q_f32(<4 x float> %a, <4 x float> %b) {
-; CHECK: test_vzip2q_f32:
+; CHECK-LABEL: test_vzip2q_f32:
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -655,15 +652,15 @@ entry:
}
define <2 x double> @test_vzip2q_f64(<2 x double> %a, <2 x double> %b) {
-; CHECK: test_vzip2q_f64:
-; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_vzip2q_f64:
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
ret <2 x double> %shuffle.i
}
define <8 x i8> @test_vzip2_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vzip2_p8:
+; CHECK-LABEL: test_vzip2_p8:
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -671,7 +668,7 @@ entry:
}
define <16 x i8> @test_vzip2q_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vzip2q_p8:
+; CHECK-LABEL: test_vzip2q_p8:
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -679,7 +676,7 @@ entry:
}
define <4 x i16> @test_vzip2_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vzip2_p16:
+; CHECK-LABEL: test_vzip2_p16:
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -687,7 +684,7 @@ entry:
}
define <8 x i16> @test_vzip2q_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vzip2q_p16:
+; CHECK-LABEL: test_vzip2q_p16:
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -695,7 +692,7 @@ entry:
}
define <8 x i8> @test_vtrn1_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtrn1_s8:
+; CHECK-LABEL: test_vtrn1_s8:
; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -703,7 +700,7 @@ entry:
}
define <16 x i8> @test_vtrn1q_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vtrn1q_s8:
+; CHECK-LABEL: test_vtrn1q_s8:
; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -711,7 +708,7 @@ entry:
}
define <4 x i16> @test_vtrn1_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vtrn1_s16:
+; CHECK-LABEL: test_vtrn1_s16:
; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -719,7 +716,7 @@ entry:
}
define <8 x i16> @test_vtrn1q_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vtrn1q_s16:
+; CHECK-LABEL: test_vtrn1q_s16:
; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -727,15 +724,15 @@ entry:
}
define <2 x i32> @test_vtrn1_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vtrn1_s32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vtrn1_s32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vtrn1q_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vtrn1q_s32:
+; CHECK-LABEL: test_vtrn1q_s32:
; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -743,15 +740,15 @@ entry:
}
define <2 x i64> @test_vtrn1q_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vtrn1q_s64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vtrn1q_s64:
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i64> %shuffle.i
}
define <8 x i8> @test_vtrn1_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtrn1_u8:
+; CHECK-LABEL: test_vtrn1_u8:
; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -759,7 +756,7 @@ entry:
}
define <16 x i8> @test_vtrn1q_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vtrn1q_u8:
+; CHECK-LABEL: test_vtrn1q_u8:
; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -767,7 +764,7 @@ entry:
}
define <4 x i16> @test_vtrn1_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vtrn1_u16:
+; CHECK-LABEL: test_vtrn1_u16:
; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -775,7 +772,7 @@ entry:
}
define <8 x i16> @test_vtrn1q_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vtrn1q_u16:
+; CHECK-LABEL: test_vtrn1q_u16:
; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -783,15 +780,15 @@ entry:
}
define <2 x i32> @test_vtrn1_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vtrn1_u32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vtrn1_u32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vtrn1q_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vtrn1q_u32:
+; CHECK-LABEL: test_vtrn1q_u32:
; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -799,23 +796,23 @@ entry:
}
define <2 x i64> @test_vtrn1q_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vtrn1q_u64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vtrn1q_u64:
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i64> %shuffle.i
}
define <2 x float> @test_vtrn1_f32(<2 x float> %a, <2 x float> %b) {
-; CHECK: test_vtrn1_f32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK-LABEL: test_vtrn1_f32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
ret <2 x float> %shuffle.i
}
define <4 x float> @test_vtrn1q_f32(<4 x float> %a, <4 x float> %b) {
-; CHECK: test_vtrn1q_f32:
+; CHECK-LABEL: test_vtrn1q_f32:
; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -823,15 +820,15 @@ entry:
}
define <2 x double> @test_vtrn1q_f64(<2 x double> %a, <2 x double> %b) {
-; CHECK: test_vtrn1q_f64:
-; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+; CHECK-LABEL: test_vtrn1q_f64:
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
ret <2 x double> %shuffle.i
}
define <8 x i8> @test_vtrn1_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtrn1_p8:
+; CHECK-LABEL: test_vtrn1_p8:
; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -839,7 +836,7 @@ entry:
}
define <16 x i8> @test_vtrn1q_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vtrn1q_p8:
+; CHECK-LABEL: test_vtrn1q_p8:
; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -847,7 +844,7 @@ entry:
}
define <4 x i16> @test_vtrn1_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vtrn1_p16:
+; CHECK-LABEL: test_vtrn1_p16:
; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -855,7 +852,7 @@ entry:
}
define <8 x i16> @test_vtrn1q_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vtrn1q_p16:
+; CHECK-LABEL: test_vtrn1q_p16:
; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -863,7 +860,7 @@ entry:
}
define <8 x i8> @test_vtrn2_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtrn2_s8:
+; CHECK-LABEL: test_vtrn2_s8:
; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -871,7 +868,7 @@ entry:
}
define <16 x i8> @test_vtrn2q_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vtrn2q_s8:
+; CHECK-LABEL: test_vtrn2q_s8:
; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
@@ -879,7 +876,7 @@ entry:
}
define <4 x i16> @test_vtrn2_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vtrn2_s16:
+; CHECK-LABEL: test_vtrn2_s16:
; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -887,7 +884,7 @@ entry:
}
define <8 x i16> @test_vtrn2q_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vtrn2q_s16:
+; CHECK-LABEL: test_vtrn2q_s16:
; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -895,15 +892,15 @@ entry:
}
define <2 x i32> @test_vtrn2_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vtrn2_s32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vtrn2_s32:
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vtrn2q_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vtrn2q_s32:
+; CHECK-LABEL: test_vtrn2q_s32:
; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -911,15 +908,15 @@ entry:
}
define <2 x i64> @test_vtrn2q_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vtrn2q_s64:
-; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_vtrn2q_s64:
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i64> %shuffle.i
}
define <8 x i8> @test_vtrn2_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtrn2_u8:
+; CHECK-LABEL: test_vtrn2_u8:
; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -927,7 +924,7 @@ entry:
}
define <16 x i8> @test_vtrn2q_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vtrn2q_u8:
+; CHECK-LABEL: test_vtrn2q_u8:
; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
@@ -935,7 +932,7 @@ entry:
}
define <4 x i16> @test_vtrn2_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vtrn2_u16:
+; CHECK-LABEL: test_vtrn2_u16:
; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -943,7 +940,7 @@ entry:
}
define <8 x i16> @test_vtrn2q_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vtrn2q_u16:
+; CHECK-LABEL: test_vtrn2q_u16:
; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -951,15 +948,15 @@ entry:
}
define <2 x i32> @test_vtrn2_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vtrn2_u32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vtrn2_u32:
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i32> %shuffle.i
}
define <4 x i32> @test_vtrn2q_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vtrn2q_u32:
+; CHECK-LABEL: test_vtrn2q_u32:
; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -967,23 +964,23 @@ entry:
}
define <2 x i64> @test_vtrn2q_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vtrn2q_u64:
-; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_vtrn2q_u64:
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i64> %shuffle.i
}
define <2 x float> @test_vtrn2_f32(<2 x float> %a, <2 x float> %b) {
-; CHECK: test_vtrn2_f32:
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vtrn2_f32:
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
ret <2 x float> %shuffle.i
}
define <4 x float> @test_vtrn2q_f32(<4 x float> %a, <4 x float> %b) {
-; CHECK: test_vtrn2q_f32:
+; CHECK-LABEL: test_vtrn2q_f32:
; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -991,15 +988,15 @@ entry:
}
define <2 x double> @test_vtrn2q_f64(<2 x double> %a, <2 x double> %b) {
-; CHECK: test_vtrn2q_f64:
-; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+; CHECK-LABEL: test_vtrn2q_f64:
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
entry:
%shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
ret <2 x double> %shuffle.i
}
define <8 x i8> @test_vtrn2_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtrn2_p8:
+; CHECK-LABEL: test_vtrn2_p8:
; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -1007,7 +1004,7 @@ entry:
}
define <16 x i8> @test_vtrn2q_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vtrn2q_p8:
+; CHECK-LABEL: test_vtrn2q_p8:
; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
@@ -1015,7 +1012,7 @@ entry:
}
define <4 x i16> @test_vtrn2_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vtrn2_p16:
+; CHECK-LABEL: test_vtrn2_p16:
; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -1023,7 +1020,7 @@ entry:
}
define <8 x i16> @test_vtrn2q_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vtrn2q_p16:
+; CHECK-LABEL: test_vtrn2q_p16:
; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -1031,7 +1028,7 @@ entry:
}
define <8 x i8> @test_same_vuzp1_s8(<8 x i8> %a) {
-; CHECK: test_same_vuzp1_s8:
+; CHECK-LABEL: test_same_vuzp1_s8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1039,7 +1036,7 @@ entry:
}
define <16 x i8> @test_same_vuzp1q_s8(<16 x i8> %a) {
-; CHECK: test_same_vuzp1q_s8:
+; CHECK-LABEL: test_same_vuzp1q_s8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -1047,7 +1044,7 @@ entry:
}
define <4 x i16> @test_same_vuzp1_s16(<4 x i16> %a) {
-; CHECK: test_same_vuzp1_s16:
+; CHECK-LABEL: test_same_vuzp1_s16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1055,7 +1052,7 @@ entry:
}
define <8 x i16> @test_same_vuzp1q_s16(<8 x i16> %a) {
-; CHECK: test_same_vuzp1q_s16:
+; CHECK-LABEL: test_same_vuzp1q_s16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1063,7 +1060,7 @@ entry:
}
define <4 x i32> @test_same_vuzp1q_s32(<4 x i32> %a) {
-; CHECK: test_same_vuzp1q_s32:
+; CHECK-LABEL: test_same_vuzp1q_s32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1071,7 +1068,7 @@ entry:
}
define <8 x i8> @test_same_vuzp1_u8(<8 x i8> %a) {
-; CHECK: test_same_vuzp1_u8:
+; CHECK-LABEL: test_same_vuzp1_u8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1079,7 +1076,7 @@ entry:
}
define <16 x i8> @test_same_vuzp1q_u8(<16 x i8> %a) {
-; CHECK: test_same_vuzp1q_u8:
+; CHECK-LABEL: test_same_vuzp1q_u8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -1087,7 +1084,7 @@ entry:
}
define <4 x i16> @test_same_vuzp1_u16(<4 x i16> %a) {
-; CHECK: test_same_vuzp1_u16:
+; CHECK-LABEL: test_same_vuzp1_u16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1095,7 +1092,7 @@ entry:
}
define <8 x i16> @test_same_vuzp1q_u16(<8 x i16> %a) {
-; CHECK: test_same_vuzp1q_u16:
+; CHECK-LABEL: test_same_vuzp1q_u16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1103,7 +1100,7 @@ entry:
}
define <4 x i32> @test_same_vuzp1q_u32(<4 x i32> %a) {
-; CHECK: test_same_vuzp1q_u32:
+; CHECK-LABEL: test_same_vuzp1q_u32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1111,7 +1108,7 @@ entry:
}
define <4 x float> @test_same_vuzp1q_f32(<4 x float> %a) {
-; CHECK: test_same_vuzp1q_f32:
+; CHECK-LABEL: test_same_vuzp1q_f32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1119,7 +1116,7 @@ entry:
}
define <8 x i8> @test_same_vuzp1_p8(<8 x i8> %a) {
-; CHECK: test_same_vuzp1_p8:
+; CHECK-LABEL: test_same_vuzp1_p8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1127,7 +1124,7 @@ entry:
}
define <16 x i8> @test_same_vuzp1q_p8(<16 x i8> %a) {
-; CHECK: test_same_vuzp1q_p8:
+; CHECK-LABEL: test_same_vuzp1q_p8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -1135,7 +1132,7 @@ entry:
}
define <4 x i16> @test_same_vuzp1_p16(<4 x i16> %a) {
-; CHECK: test_same_vuzp1_p16:
+; CHECK-LABEL: test_same_vuzp1_p16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1143,7 +1140,7 @@ entry:
}
define <8 x i16> @test_same_vuzp1q_p16(<8 x i16> %a) {
-; CHECK: test_same_vuzp1q_p16:
+; CHECK-LABEL: test_same_vuzp1q_p16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1151,7 +1148,7 @@ entry:
}
define <8 x i8> @test_same_vuzp2_s8(<8 x i8> %a) {
-; CHECK: test_same_vuzp2_s8:
+; CHECK-LABEL: test_same_vuzp2_s8:
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1159,7 +1156,7 @@ entry:
}
define <16 x i8> @test_same_vuzp2q_s8(<16 x i8> %a) {
-; CHECK: test_same_vuzp2q_s8:
+; CHECK-LABEL: test_same_vuzp2q_s8:
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
@@ -1167,7 +1164,7 @@ entry:
}
define <4 x i16> @test_same_vuzp2_s16(<4 x i16> %a) {
-; CHECK: test_same_vuzp2_s16:
+; CHECK-LABEL: test_same_vuzp2_s16:
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1175,7 +1172,7 @@ entry:
}
define <8 x i16> @test_same_vuzp2q_s16(<8 x i16> %a) {
-; CHECK: test_same_vuzp2q_s16:
+; CHECK-LABEL: test_same_vuzp2q_s16:
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1183,7 +1180,7 @@ entry:
}
define <4 x i32> @test_same_vuzp2q_s32(<4 x i32> %a) {
-; CHECK: test_same_vuzp2q_s32:
+; CHECK-LABEL: test_same_vuzp2q_s32:
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1191,7 +1188,7 @@ entry:
}
define <8 x i8> @test_same_vuzp2_u8(<8 x i8> %a) {
-; CHECK: test_same_vuzp2_u8:
+; CHECK-LABEL: test_same_vuzp2_u8:
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1199,7 +1196,7 @@ entry:
}
define <16 x i8> @test_same_vuzp2q_u8(<16 x i8> %a) {
-; CHECK: test_same_vuzp2q_u8:
+; CHECK-LABEL: test_same_vuzp2q_u8:
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
@@ -1207,7 +1204,7 @@ entry:
}
define <4 x i16> @test_same_vuzp2_u16(<4 x i16> %a) {
-; CHECK: test_same_vuzp2_u16:
+; CHECK-LABEL: test_same_vuzp2_u16:
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1215,7 +1212,7 @@ entry:
}
define <8 x i16> @test_same_vuzp2q_u16(<8 x i16> %a) {
-; CHECK: test_same_vuzp2q_u16:
+; CHECK-LABEL: test_same_vuzp2q_u16:
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1223,7 +1220,7 @@ entry:
}
define <4 x i32> @test_same_vuzp2q_u32(<4 x i32> %a) {
-; CHECK: test_same_vuzp2q_u32:
+; CHECK-LABEL: test_same_vuzp2q_u32:
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1231,7 +1228,7 @@ entry:
}
define <4 x float> @test_same_vuzp2q_f32(<4 x float> %a) {
-; CHECK: test_same_vuzp2q_f32:
+; CHECK-LABEL: test_same_vuzp2q_f32:
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1239,7 +1236,7 @@ entry:
}
define <8 x i8> @test_same_vuzp2_p8(<8 x i8> %a) {
-; CHECK: test_same_vuzp2_p8:
+; CHECK-LABEL: test_same_vuzp2_p8:
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1247,7 +1244,7 @@ entry:
}
define <16 x i8> @test_same_vuzp2q_p8(<16 x i8> %a) {
-; CHECK: test_same_vuzp2q_p8:
+; CHECK-LABEL: test_same_vuzp2q_p8:
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
@@ -1255,7 +1252,7 @@ entry:
}
define <4 x i16> @test_same_vuzp2_p16(<4 x i16> %a) {
-; CHECK: test_same_vuzp2_p16:
+; CHECK-LABEL: test_same_vuzp2_p16:
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1263,7 +1260,7 @@ entry:
}
define <8 x i16> @test_same_vuzp2q_p16(<8 x i16> %a) {
-; CHECK: test_same_vuzp2q_p16:
+; CHECK-LABEL: test_same_vuzp2q_p16:
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1271,7 +1268,7 @@ entry:
}
define <8 x i8> @test_same_vzip1_s8(<8 x i8> %a) {
-; CHECK: test_same_vzip1_s8:
+; CHECK-LABEL: test_same_vzip1_s8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -1279,7 +1276,7 @@ entry:
}
define <16 x i8> @test_same_vzip1q_s8(<16 x i8> %a) {
-; CHECK: test_same_vzip1q_s8:
+; CHECK-LABEL: test_same_vzip1q_s8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -1287,7 +1284,7 @@ entry:
}
define <4 x i16> @test_same_vzip1_s16(<4 x i16> %a) {
-; CHECK: test_same_vzip1_s16:
+; CHECK-LABEL: test_same_vzip1_s16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -1295,7 +1292,7 @@ entry:
}
define <8 x i16> @test_same_vzip1q_s16(<8 x i16> %a) {
-; CHECK: test_same_vzip1q_s16:
+; CHECK-LABEL: test_same_vzip1q_s16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -1303,7 +1300,7 @@ entry:
}
define <4 x i32> @test_same_vzip1q_s32(<4 x i32> %a) {
-; CHECK: test_same_vzip1q_s32:
+; CHECK-LABEL: test_same_vzip1q_s32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -1311,7 +1308,7 @@ entry:
}
define <8 x i8> @test_same_vzip1_u8(<8 x i8> %a) {
-; CHECK: test_same_vzip1_u8:
+; CHECK-LABEL: test_same_vzip1_u8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -1319,7 +1316,7 @@ entry:
}
define <16 x i8> @test_same_vzip1q_u8(<16 x i8> %a) {
-; CHECK: test_same_vzip1q_u8:
+; CHECK-LABEL: test_same_vzip1q_u8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -1327,7 +1324,7 @@ entry:
}
define <4 x i16> @test_same_vzip1_u16(<4 x i16> %a) {
-; CHECK: test_same_vzip1_u16:
+; CHECK-LABEL: test_same_vzip1_u16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -1335,7 +1332,7 @@ entry:
}
define <8 x i16> @test_same_vzip1q_u16(<8 x i16> %a) {
-; CHECK: test_same_vzip1q_u16:
+; CHECK-LABEL: test_same_vzip1q_u16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -1343,7 +1340,7 @@ entry:
}
define <4 x i32> @test_same_vzip1q_u32(<4 x i32> %a) {
-; CHECK: test_same_vzip1q_u32:
+; CHECK-LABEL: test_same_vzip1q_u32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -1351,7 +1348,7 @@ entry:
}
define <4 x float> @test_same_vzip1q_f32(<4 x float> %a) {
-; CHECK: test_same_vzip1q_f32:
+; CHECK-LABEL: test_same_vzip1q_f32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -1359,7 +1356,7 @@ entry:
}
define <8 x i8> @test_same_vzip1_p8(<8 x i8> %a) {
-; CHECK: test_same_vzip1_p8:
+; CHECK-LABEL: test_same_vzip1_p8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -1367,7 +1364,7 @@ entry:
}
define <16 x i8> @test_same_vzip1q_p8(<16 x i8> %a) {
-; CHECK: test_same_vzip1q_p8:
+; CHECK-LABEL: test_same_vzip1q_p8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -1375,7 +1372,7 @@ entry:
}
define <4 x i16> @test_same_vzip1_p16(<4 x i16> %a) {
-; CHECK: test_same_vzip1_p16:
+; CHECK-LABEL: test_same_vzip1_p16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -1383,7 +1380,7 @@ entry:
}
define <8 x i16> @test_same_vzip1q_p16(<8 x i16> %a) {
-; CHECK: test_same_vzip1q_p16:
+; CHECK-LABEL: test_same_vzip1q_p16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -1391,7 +1388,7 @@ entry:
}
define <8 x i8> @test_same_vzip2_s8(<8 x i8> %a) {
-; CHECK: test_same_vzip2_s8:
+; CHECK-LABEL: test_same_vzip2_s8:
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -1399,7 +1396,7 @@ entry:
}
define <16 x i8> @test_same_vzip2q_s8(<16 x i8> %a) {
-; CHECK: test_same_vzip2q_s8:
+; CHECK-LABEL: test_same_vzip2q_s8:
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -1407,7 +1404,7 @@ entry:
}
define <4 x i16> @test_same_vzip2_s16(<4 x i16> %a) {
-; CHECK: test_same_vzip2_s16:
+; CHECK-LABEL: test_same_vzip2_s16:
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -1415,7 +1412,7 @@ entry:
}
define <8 x i16> @test_same_vzip2q_s16(<8 x i16> %a) {
-; CHECK: test_same_vzip2q_s16:
+; CHECK-LABEL: test_same_vzip2q_s16:
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -1423,7 +1420,7 @@ entry:
}
define <4 x i32> @test_same_vzip2q_s32(<4 x i32> %a) {
-; CHECK: test_same_vzip2q_s32:
+; CHECK-LABEL: test_same_vzip2q_s32:
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -1431,7 +1428,7 @@ entry:
}
define <8 x i8> @test_same_vzip2_u8(<8 x i8> %a) {
-; CHECK: test_same_vzip2_u8:
+; CHECK-LABEL: test_same_vzip2_u8:
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -1439,7 +1436,7 @@ entry:
}
define <16 x i8> @test_same_vzip2q_u8(<16 x i8> %a) {
-; CHECK: test_same_vzip2q_u8:
+; CHECK-LABEL: test_same_vzip2q_u8:
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -1447,7 +1444,7 @@ entry:
}
define <4 x i16> @test_same_vzip2_u16(<4 x i16> %a) {
-; CHECK: test_same_vzip2_u16:
+; CHECK-LABEL: test_same_vzip2_u16:
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -1455,7 +1452,7 @@ entry:
}
define <8 x i16> @test_same_vzip2q_u16(<8 x i16> %a) {
-; CHECK: test_same_vzip2q_u16:
+; CHECK-LABEL: test_same_vzip2q_u16:
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -1463,7 +1460,7 @@ entry:
}
define <4 x i32> @test_same_vzip2q_u32(<4 x i32> %a) {
-; CHECK: test_same_vzip2q_u32:
+; CHECK-LABEL: test_same_vzip2q_u32:
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -1471,7 +1468,7 @@ entry:
}
define <4 x float> @test_same_vzip2q_f32(<4 x float> %a) {
-; CHECK: test_same_vzip2q_f32:
+; CHECK-LABEL: test_same_vzip2q_f32:
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -1479,7 +1476,7 @@ entry:
}
define <8 x i8> @test_same_vzip2_p8(<8 x i8> %a) {
-; CHECK: test_same_vzip2_p8:
+; CHECK-LABEL: test_same_vzip2_p8:
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -1487,7 +1484,7 @@ entry:
}
define <16 x i8> @test_same_vzip2q_p8(<16 x i8> %a) {
-; CHECK: test_same_vzip2q_p8:
+; CHECK-LABEL: test_same_vzip2q_p8:
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -1495,7 +1492,7 @@ entry:
}
define <4 x i16> @test_same_vzip2_p16(<4 x i16> %a) {
-; CHECK: test_same_vzip2_p16:
+; CHECK-LABEL: test_same_vzip2_p16:
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -1503,7 +1500,7 @@ entry:
}
define <8 x i16> @test_same_vzip2q_p16(<8 x i16> %a) {
-; CHECK: test_same_vzip2q_p16:
+; CHECK-LABEL: test_same_vzip2q_p16:
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -1511,7 +1508,7 @@ entry:
}
define <8 x i8> @test_same_vtrn1_s8(<8 x i8> %a) {
-; CHECK: test_same_vtrn1_s8:
+; CHECK-LABEL: test_same_vtrn1_s8:
; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -1519,7 +1516,7 @@ entry:
}
define <16 x i8> @test_same_vtrn1q_s8(<16 x i8> %a) {
-; CHECK: test_same_vtrn1q_s8:
+; CHECK-LABEL: test_same_vtrn1q_s8:
; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -1527,7 +1524,7 @@ entry:
}
define <4 x i16> @test_same_vtrn1_s16(<4 x i16> %a) {
-; CHECK: test_same_vtrn1_s16:
+; CHECK-LABEL: test_same_vtrn1_s16:
; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -1535,7 +1532,7 @@ entry:
}
define <8 x i16> @test_same_vtrn1q_s16(<8 x i16> %a) {
-; CHECK: test_same_vtrn1q_s16:
+; CHECK-LABEL: test_same_vtrn1q_s16:
; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -1543,7 +1540,7 @@ entry:
}
define <4 x i32> @test_same_vtrn1q_s32(<4 x i32> %a) {
-; CHECK: test_same_vtrn1q_s32:
+; CHECK-LABEL: test_same_vtrn1q_s32:
; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -1551,7 +1548,7 @@ entry:
}
define <8 x i8> @test_same_vtrn1_u8(<8 x i8> %a) {
-; CHECK: test_same_vtrn1_u8:
+; CHECK-LABEL: test_same_vtrn1_u8:
; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -1559,7 +1556,7 @@ entry:
}
define <16 x i8> @test_same_vtrn1q_u8(<16 x i8> %a) {
-; CHECK: test_same_vtrn1q_u8:
+; CHECK-LABEL: test_same_vtrn1q_u8:
; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -1567,7 +1564,7 @@ entry:
}
define <4 x i16> @test_same_vtrn1_u16(<4 x i16> %a) {
-; CHECK: test_same_vtrn1_u16:
+; CHECK-LABEL: test_same_vtrn1_u16:
; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -1575,7 +1572,7 @@ entry:
}
define <8 x i16> @test_same_vtrn1q_u16(<8 x i16> %a) {
-; CHECK: test_same_vtrn1q_u16:
+; CHECK-LABEL: test_same_vtrn1q_u16:
; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -1583,7 +1580,7 @@ entry:
}
define <4 x i32> @test_same_vtrn1q_u32(<4 x i32> %a) {
-; CHECK: test_same_vtrn1q_u32:
+; CHECK-LABEL: test_same_vtrn1q_u32:
; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -1591,7 +1588,7 @@ entry:
}
define <4 x float> @test_same_vtrn1q_f32(<4 x float> %a) {
-; CHECK: test_same_vtrn1q_f32:
+; CHECK-LABEL: test_same_vtrn1q_f32:
; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -1599,7 +1596,7 @@ entry:
}
define <8 x i8> @test_same_vtrn1_p8(<8 x i8> %a) {
-; CHECK: test_same_vtrn1_p8:
+; CHECK-LABEL: test_same_vtrn1_p8:
; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -1607,7 +1604,7 @@ entry:
}
define <16 x i8> @test_same_vtrn1q_p8(<16 x i8> %a) {
-; CHECK: test_same_vtrn1q_p8:
+; CHECK-LABEL: test_same_vtrn1q_p8:
; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -1615,7 +1612,7 @@ entry:
}
define <4 x i16> @test_same_vtrn1_p16(<4 x i16> %a) {
-; CHECK: test_same_vtrn1_p16:
+; CHECK-LABEL: test_same_vtrn1_p16:
; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -1623,7 +1620,7 @@ entry:
}
define <8 x i16> @test_same_vtrn1q_p16(<8 x i16> %a) {
-; CHECK: test_same_vtrn1q_p16:
+; CHECK-LABEL: test_same_vtrn1q_p16:
; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -1631,7 +1628,7 @@ entry:
}
define <8 x i8> @test_same_vtrn2_s8(<8 x i8> %a) {
-; CHECK: test_same_vtrn2_s8:
+; CHECK-LABEL: test_same_vtrn2_s8:
; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -1639,7 +1636,7 @@ entry:
}
define <16 x i8> @test_same_vtrn2q_s8(<16 x i8> %a) {
-; CHECK: test_same_vtrn2q_s8:
+; CHECK-LABEL: test_same_vtrn2q_s8:
; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
@@ -1647,7 +1644,7 @@ entry:
}
define <4 x i16> @test_same_vtrn2_s16(<4 x i16> %a) {
-; CHECK: test_same_vtrn2_s16:
+; CHECK-LABEL: test_same_vtrn2_s16:
; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -1655,7 +1652,7 @@ entry:
}
define <8 x i16> @test_same_vtrn2q_s16(<8 x i16> %a) {
-; CHECK: test_same_vtrn2q_s16:
+; CHECK-LABEL: test_same_vtrn2q_s16:
; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -1663,7 +1660,7 @@ entry:
}
define <4 x i32> @test_same_vtrn2q_s32(<4 x i32> %a) {
-; CHECK: test_same_vtrn2q_s32:
+; CHECK-LABEL: test_same_vtrn2q_s32:
; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -1671,7 +1668,7 @@ entry:
}
define <8 x i8> @test_same_vtrn2_u8(<8 x i8> %a) {
-; CHECK: test_same_vtrn2_u8:
+; CHECK-LABEL: test_same_vtrn2_u8:
; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -1679,7 +1676,7 @@ entry:
}
define <16 x i8> @test_same_vtrn2q_u8(<16 x i8> %a) {
-; CHECK: test_same_vtrn2q_u8:
+; CHECK-LABEL: test_same_vtrn2q_u8:
; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
@@ -1687,7 +1684,7 @@ entry:
}
define <4 x i16> @test_same_vtrn2_u16(<4 x i16> %a) {
-; CHECK: test_same_vtrn2_u16:
+; CHECK-LABEL: test_same_vtrn2_u16:
; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -1695,7 +1692,7 @@ entry:
}
define <8 x i16> @test_same_vtrn2q_u16(<8 x i16> %a) {
-; CHECK: test_same_vtrn2q_u16:
+; CHECK-LABEL: test_same_vtrn2q_u16:
; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -1703,7 +1700,7 @@ entry:
}
define <4 x i32> @test_same_vtrn2q_u32(<4 x i32> %a) {
-; CHECK: test_same_vtrn2q_u32:
+; CHECK-LABEL: test_same_vtrn2q_u32:
; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -1711,7 +1708,7 @@ entry:
}
define <4 x float> @test_same_vtrn2q_f32(<4 x float> %a) {
-; CHECK: test_same_vtrn2q_f32:
+; CHECK-LABEL: test_same_vtrn2q_f32:
; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -1719,7 +1716,7 @@ entry:
}
define <8 x i8> @test_same_vtrn2_p8(<8 x i8> %a) {
-; CHECK: test_same_vtrn2_p8:
+; CHECK-LABEL: test_same_vtrn2_p8:
; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -1727,7 +1724,7 @@ entry:
}
define <16 x i8> @test_same_vtrn2q_p8(<16 x i8> %a) {
-; CHECK: test_same_vtrn2q_p8:
+; CHECK-LABEL: test_same_vtrn2q_p8:
; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
@@ -1735,7 +1732,7 @@ entry:
}
define <4 x i16> @test_same_vtrn2_p16(<4 x i16> %a) {
-; CHECK: test_same_vtrn2_p16:
+; CHECK-LABEL: test_same_vtrn2_p16:
; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -1743,7 +1740,7 @@ entry:
}
define <8 x i16> @test_same_vtrn2q_p16(<8 x i16> %a) {
-; CHECK: test_same_vtrn2q_p16:
+; CHECK-LABEL: test_same_vtrn2q_p16:
; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -1752,7 +1749,7 @@ entry:
define <8 x i8> @test_undef_vuzp1_s8(<8 x i8> %a) {
-; CHECK: test_undef_vuzp1_s8:
+; CHECK-LABEL: test_undef_vuzp1_s8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1760,7 +1757,7 @@ entry:
}
define <16 x i8> @test_undef_vuzp1q_s8(<16 x i8> %a) {
-; CHECK: test_undef_vuzp1q_s8:
+; CHECK-LABEL: test_undef_vuzp1q_s8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -1768,7 +1765,7 @@ entry:
}
define <4 x i16> @test_undef_vuzp1_s16(<4 x i16> %a) {
-; CHECK: test_undef_vuzp1_s16:
+; CHECK-LABEL: test_undef_vuzp1_s16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1776,7 +1773,7 @@ entry:
}
define <8 x i16> @test_undef_vuzp1q_s16(<8 x i16> %a) {
-; CHECK: test_undef_vuzp1q_s16:
+; CHECK-LABEL: test_undef_vuzp1q_s16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1784,7 +1781,7 @@ entry:
}
define <4 x i32> @test_undef_vuzp1q_s32(<4 x i32> %a) {
-; CHECK: test_undef_vuzp1q_s32:
+; CHECK-LABEL: test_undef_vuzp1q_s32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1792,7 +1789,7 @@ entry:
}
define <8 x i8> @test_undef_vuzp1_u8(<8 x i8> %a) {
-; CHECK: test_undef_vuzp1_u8:
+; CHECK-LABEL: test_undef_vuzp1_u8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1800,7 +1797,7 @@ entry:
}
define <16 x i8> @test_undef_vuzp1q_u8(<16 x i8> %a) {
-; CHECK: test_undef_vuzp1q_u8:
+; CHECK-LABEL: test_undef_vuzp1q_u8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -1808,7 +1805,7 @@ entry:
}
define <4 x i16> @test_undef_vuzp1_u16(<4 x i16> %a) {
-; CHECK: test_undef_vuzp1_u16:
+; CHECK-LABEL: test_undef_vuzp1_u16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1816,7 +1813,7 @@ entry:
}
define <8 x i16> @test_undef_vuzp1q_u16(<8 x i16> %a) {
-; CHECK: test_undef_vuzp1q_u16:
+; CHECK-LABEL: test_undef_vuzp1q_u16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1824,7 +1821,7 @@ entry:
}
define <4 x i32> @test_undef_vuzp1q_u32(<4 x i32> %a) {
-; CHECK: test_undef_vuzp1q_u32:
+; CHECK-LABEL: test_undef_vuzp1q_u32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1832,7 +1829,7 @@ entry:
}
define <4 x float> @test_undef_vuzp1q_f32(<4 x float> %a) {
-; CHECK: test_undef_vuzp1q_f32:
+; CHECK-LABEL: test_undef_vuzp1q_f32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1840,7 +1837,7 @@ entry:
}
define <8 x i8> @test_undef_vuzp1_p8(<8 x i8> %a) {
-; CHECK: test_undef_vuzp1_p8:
+; CHECK-LABEL: test_undef_vuzp1_p8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1848,7 +1845,7 @@ entry:
}
define <16 x i8> @test_undef_vuzp1q_p8(<16 x i8> %a) {
-; CHECK: test_undef_vuzp1q_p8:
+; CHECK-LABEL: test_undef_vuzp1q_p8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -1856,7 +1853,7 @@ entry:
}
define <4 x i16> @test_undef_vuzp1_p16(<4 x i16> %a) {
-; CHECK: test_undef_vuzp1_p16:
+; CHECK-LABEL: test_undef_vuzp1_p16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -1864,7 +1861,7 @@ entry:
}
define <8 x i16> @test_undef_vuzp1q_p16(<8 x i16> %a) {
-; CHECK: test_undef_vuzp1q_p16:
+; CHECK-LABEL: test_undef_vuzp1q_p16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1872,7 +1869,7 @@ entry:
}
define <8 x i8> @test_undef_vuzp2_s8(<8 x i8> %a) {
-; CHECK: test_undef_vuzp2_s8:
+; CHECK-LABEL: test_undef_vuzp2_s8:
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1880,7 +1877,7 @@ entry:
}
define <16 x i8> @test_undef_vuzp2q_s8(<16 x i8> %a) {
-; CHECK: test_undef_vuzp2q_s8:
+; CHECK-LABEL: test_undef_vuzp2q_s8:
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
@@ -1888,7 +1885,7 @@ entry:
}
define <4 x i16> @test_undef_vuzp2_s16(<4 x i16> %a) {
-; CHECK: test_undef_vuzp2_s16:
+; CHECK-LABEL: test_undef_vuzp2_s16:
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1896,7 +1893,7 @@ entry:
}
define <8 x i16> @test_undef_vuzp2q_s16(<8 x i16> %a) {
-; CHECK: test_undef_vuzp2q_s16:
+; CHECK-LABEL: test_undef_vuzp2q_s16:
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1904,7 +1901,7 @@ entry:
}
define <4 x i32> @test_undef_vuzp2q_s32(<4 x i32> %a) {
-; CHECK: test_undef_vuzp2q_s32:
+; CHECK-LABEL: test_undef_vuzp2q_s32:
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1912,7 +1909,7 @@ entry:
}
define <8 x i8> @test_undef_vuzp2_u8(<8 x i8> %a) {
-; CHECK: test_undef_vuzp2_u8:
+; CHECK-LABEL: test_undef_vuzp2_u8:
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1920,7 +1917,7 @@ entry:
}
define <16 x i8> @test_undef_vuzp2q_u8(<16 x i8> %a) {
-; CHECK: test_undef_vuzp2q_u8:
+; CHECK-LABEL: test_undef_vuzp2q_u8:
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
@@ -1928,7 +1925,7 @@ entry:
}
define <4 x i16> @test_undef_vuzp2_u16(<4 x i16> %a) {
-; CHECK: test_undef_vuzp2_u16:
+; CHECK-LABEL: test_undef_vuzp2_u16:
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1936,7 +1933,7 @@ entry:
}
define <8 x i16> @test_undef_vuzp2q_u16(<8 x i16> %a) {
-; CHECK: test_undef_vuzp2q_u16:
+; CHECK-LABEL: test_undef_vuzp2q_u16:
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1944,7 +1941,7 @@ entry:
}
define <4 x i32> @test_undef_vuzp2q_u32(<4 x i32> %a) {
-; CHECK: test_undef_vuzp2q_u32:
+; CHECK-LABEL: test_undef_vuzp2q_u32:
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1952,7 +1949,7 @@ entry:
}
define <4 x float> @test_undef_vuzp2q_f32(<4 x float> %a) {
-; CHECK: test_undef_vuzp2q_f32:
+; CHECK-LABEL: test_undef_vuzp2q_f32:
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1960,7 +1957,7 @@ entry:
}
define <8 x i8> @test_undef_vuzp2_p8(<8 x i8> %a) {
-; CHECK: test_undef_vuzp2_p8:
+; CHECK-LABEL: test_undef_vuzp2_p8:
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1968,7 +1965,7 @@ entry:
}
define <16 x i8> @test_undef_vuzp2q_p8(<16 x i8> %a) {
-; CHECK: test_undef_vuzp2q_p8:
+; CHECK-LABEL: test_undef_vuzp2q_p8:
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
@@ -1976,7 +1973,7 @@ entry:
}
define <4 x i16> @test_undef_vuzp2_p16(<4 x i16> %a) {
-; CHECK: test_undef_vuzp2_p16:
+; CHECK-LABEL: test_undef_vuzp2_p16:
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -1984,7 +1981,7 @@ entry:
}
define <8 x i16> @test_undef_vuzp2q_p16(<8 x i16> %a) {
-; CHECK: test_undef_vuzp2q_p16:
+; CHECK-LABEL: test_undef_vuzp2q_p16:
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -1992,7 +1989,7 @@ entry:
}
define <8 x i8> @test_undef_vzip1_s8(<8 x i8> %a) {
-; CHECK: test_undef_vzip1_s8:
+; CHECK-LABEL: test_undef_vzip1_s8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -2000,7 +1997,7 @@ entry:
}
define <16 x i8> @test_undef_vzip1q_s8(<16 x i8> %a) {
-; CHECK: test_undef_vzip1q_s8:
+; CHECK-LABEL: test_undef_vzip1q_s8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -2008,7 +2005,7 @@ entry:
}
define <4 x i16> @test_undef_vzip1_s16(<4 x i16> %a) {
-; CHECK: test_undef_vzip1_s16:
+; CHECK-LABEL: test_undef_vzip1_s16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -2016,7 +2013,7 @@ entry:
}
define <8 x i16> @test_undef_vzip1q_s16(<8 x i16> %a) {
-; CHECK: test_undef_vzip1q_s16:
+; CHECK-LABEL: test_undef_vzip1q_s16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -2024,7 +2021,7 @@ entry:
}
define <4 x i32> @test_undef_vzip1q_s32(<4 x i32> %a) {
-; CHECK: test_undef_vzip1q_s32:
+; CHECK-LABEL: test_undef_vzip1q_s32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -2032,7 +2029,7 @@ entry:
}
define <8 x i8> @test_undef_vzip1_u8(<8 x i8> %a) {
-; CHECK: test_undef_vzip1_u8:
+; CHECK-LABEL: test_undef_vzip1_u8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -2040,7 +2037,7 @@ entry:
}
define <16 x i8> @test_undef_vzip1q_u8(<16 x i8> %a) {
-; CHECK: test_undef_vzip1q_u8:
+; CHECK-LABEL: test_undef_vzip1q_u8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -2048,7 +2045,7 @@ entry:
}
define <4 x i16> @test_undef_vzip1_u16(<4 x i16> %a) {
-; CHECK: test_undef_vzip1_u16:
+; CHECK-LABEL: test_undef_vzip1_u16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -2056,7 +2053,7 @@ entry:
}
define <8 x i16> @test_undef_vzip1q_u16(<8 x i16> %a) {
-; CHECK: test_undef_vzip1q_u16:
+; CHECK-LABEL: test_undef_vzip1q_u16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -2064,7 +2061,7 @@ entry:
}
define <4 x i32> @test_undef_vzip1q_u32(<4 x i32> %a) {
-; CHECK: test_undef_vzip1q_u32:
+; CHECK-LABEL: test_undef_vzip1q_u32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -2072,7 +2069,7 @@ entry:
}
define <4 x float> @test_undef_vzip1q_f32(<4 x float> %a) {
-; CHECK: test_undef_vzip1q_f32:
+; CHECK-LABEL: test_undef_vzip1q_f32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -2080,7 +2077,7 @@ entry:
}
define <8 x i8> @test_undef_vzip1_p8(<8 x i8> %a) {
-; CHECK: test_undef_vzip1_p8:
+; CHECK-LABEL: test_undef_vzip1_p8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -2088,7 +2085,7 @@ entry:
}
define <16 x i8> @test_undef_vzip1q_p8(<16 x i8> %a) {
-; CHECK: test_undef_vzip1q_p8:
+; CHECK-LABEL: test_undef_vzip1q_p8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -2096,7 +2093,7 @@ entry:
}
define <4 x i16> @test_undef_vzip1_p16(<4 x i16> %a) {
-; CHECK: test_undef_vzip1_p16:
+; CHECK-LABEL: test_undef_vzip1_p16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -2104,7 +2101,7 @@ entry:
}
define <8 x i16> @test_undef_vzip1q_p16(<8 x i16> %a) {
-; CHECK: test_undef_vzip1q_p16:
+; CHECK-LABEL: test_undef_vzip1q_p16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -2112,7 +2109,7 @@ entry:
}
define <8 x i8> @test_undef_vzip2_s8(<8 x i8> %a) {
-; CHECK: test_undef_vzip2_s8:
+; CHECK-LABEL: test_undef_vzip2_s8:
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -2120,7 +2117,7 @@ entry:
}
define <16 x i8> @test_undef_vzip2q_s8(<16 x i8> %a) {
-; CHECK: test_undef_vzip2q_s8:
+; CHECK-LABEL: test_undef_vzip2q_s8:
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -2128,7 +2125,7 @@ entry:
}
define <4 x i16> @test_undef_vzip2_s16(<4 x i16> %a) {
-; CHECK: test_undef_vzip2_s16:
+; CHECK-LABEL: test_undef_vzip2_s16:
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -2136,7 +2133,7 @@ entry:
}
define <8 x i16> @test_undef_vzip2q_s16(<8 x i16> %a) {
-; CHECK: test_undef_vzip2q_s16:
+; CHECK-LABEL: test_undef_vzip2q_s16:
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -2144,7 +2141,7 @@ entry:
}
define <4 x i32> @test_undef_vzip2q_s32(<4 x i32> %a) {
-; CHECK: test_undef_vzip2q_s32:
+; CHECK-LABEL: test_undef_vzip2q_s32:
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -2152,7 +2149,7 @@ entry:
}
define <8 x i8> @test_undef_vzip2_u8(<8 x i8> %a) {
-; CHECK: test_undef_vzip2_u8:
+; CHECK-LABEL: test_undef_vzip2_u8:
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -2160,7 +2157,7 @@ entry:
}
define <16 x i8> @test_undef_vzip2q_u8(<16 x i8> %a) {
-; CHECK: test_undef_vzip2q_u8:
+; CHECK-LABEL: test_undef_vzip2q_u8:
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -2168,7 +2165,7 @@ entry:
}
define <4 x i16> @test_undef_vzip2_u16(<4 x i16> %a) {
-; CHECK: test_undef_vzip2_u16:
+; CHECK-LABEL: test_undef_vzip2_u16:
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -2176,7 +2173,7 @@ entry:
}
define <8 x i16> @test_undef_vzip2q_u16(<8 x i16> %a) {
-; CHECK: test_undef_vzip2q_u16:
+; CHECK-LABEL: test_undef_vzip2q_u16:
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -2184,7 +2181,7 @@ entry:
}
define <4 x i32> @test_undef_vzip2q_u32(<4 x i32> %a) {
-; CHECK: test_undef_vzip2q_u32:
+; CHECK-LABEL: test_undef_vzip2q_u32:
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -2192,7 +2189,7 @@ entry:
}
define <4 x float> @test_undef_vzip2q_f32(<4 x float> %a) {
-; CHECK: test_undef_vzip2q_f32:
+; CHECK-LABEL: test_undef_vzip2q_f32:
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -2200,7 +2197,7 @@ entry:
}
define <8 x i8> @test_undef_vzip2_p8(<8 x i8> %a) {
-; CHECK: test_undef_vzip2_p8:
+; CHECK-LABEL: test_undef_vzip2_p8:
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -2208,7 +2205,7 @@ entry:
}
define <16 x i8> @test_undef_vzip2q_p8(<16 x i8> %a) {
-; CHECK: test_undef_vzip2q_p8:
+; CHECK-LABEL: test_undef_vzip2q_p8:
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -2216,7 +2213,7 @@ entry:
}
define <4 x i16> @test_undef_vzip2_p16(<4 x i16> %a) {
-; CHECK: test_undef_vzip2_p16:
+; CHECK-LABEL: test_undef_vzip2_p16:
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -2224,7 +2221,7 @@ entry:
}
define <8 x i16> @test_undef_vzip2q_p16(<8 x i16> %a) {
-; CHECK: test_undef_vzip2q_p16:
+; CHECK-LABEL: test_undef_vzip2q_p16:
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -2232,7 +2229,7 @@ entry:
}
define <8 x i8> @test_undef_vtrn1_s8(<8 x i8> %a) {
-; CHECK: test_undef_vtrn1_s8:
+; CHECK-LABEL: test_undef_vtrn1_s8:
; CHECK: ret
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -2240,7 +2237,7 @@ entry:
}
define <16 x i8> @test_undef_vtrn1q_s8(<16 x i8> %a) {
-; CHECK: test_undef_vtrn1q_s8:
+; CHECK-LABEL: test_undef_vtrn1q_s8:
; CHECK: ret
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -2248,7 +2245,7 @@ entry:
}
define <4 x i16> @test_undef_vtrn1_s16(<4 x i16> %a) {
-; CHECK: test_undef_vtrn1_s16:
+; CHECK-LABEL: test_undef_vtrn1_s16:
; CHECK: ret
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -2256,7 +2253,7 @@ entry:
}
define <8 x i16> @test_undef_vtrn1q_s16(<8 x i16> %a) {
-; CHECK: test_undef_vtrn1q_s16:
+; CHECK-LABEL: test_undef_vtrn1q_s16:
; CHECK: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -2264,7 +2261,7 @@ entry:
}
define <4 x i32> @test_undef_vtrn1q_s32(<4 x i32> %a) {
-; CHECK: test_undef_vtrn1q_s32:
+; CHECK-LABEL: test_undef_vtrn1q_s32:
; CHECK: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -2272,7 +2269,7 @@ entry:
}
define <8 x i8> @test_undef_vtrn1_u8(<8 x i8> %a) {
-; CHECK: test_undef_vtrn1_u8:
+; CHECK-LABEL: test_undef_vtrn1_u8:
; CHECK: ret
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -2280,7 +2277,7 @@ entry:
}
define <16 x i8> @test_undef_vtrn1q_u8(<16 x i8> %a) {
-; CHECK: test_undef_vtrn1q_u8:
+; CHECK-LABEL: test_undef_vtrn1q_u8:
; CHECK: ret
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -2288,7 +2285,7 @@ entry:
}
define <4 x i16> @test_undef_vtrn1_u16(<4 x i16> %a) {
-; CHECK: test_undef_vtrn1_u16:
+; CHECK-LABEL: test_undef_vtrn1_u16:
; CHECK: ret
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -2296,7 +2293,7 @@ entry:
}
define <8 x i16> @test_undef_vtrn1q_u16(<8 x i16> %a) {
-; CHECK: test_undef_vtrn1q_u16:
+; CHECK-LABEL: test_undef_vtrn1q_u16:
; CHECK: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -2304,7 +2301,7 @@ entry:
}
define <4 x i32> @test_undef_vtrn1q_u32(<4 x i32> %a) {
-; CHECK: test_undef_vtrn1q_u32:
+; CHECK-LABEL: test_undef_vtrn1q_u32:
; CHECK: ret
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -2312,7 +2309,7 @@ entry:
}
define <4 x float> @test_undef_vtrn1q_f32(<4 x float> %a) {
-; CHECK: test_undef_vtrn1q_f32:
+; CHECK-LABEL: test_undef_vtrn1q_f32:
; CHECK: ret
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -2320,7 +2317,7 @@ entry:
}
define <8 x i8> @test_undef_vtrn1_p8(<8 x i8> %a) {
-; CHECK: test_undef_vtrn1_p8:
+; CHECK-LABEL: test_undef_vtrn1_p8:
; CHECK: ret
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -2328,7 +2325,7 @@ entry:
}
define <16 x i8> @test_undef_vtrn1q_p8(<16 x i8> %a) {
-; CHECK: test_undef_vtrn1q_p8:
+; CHECK-LABEL: test_undef_vtrn1q_p8:
; CHECK: ret
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -2336,7 +2333,7 @@ entry:
}
define <4 x i16> @test_undef_vtrn1_p16(<4 x i16> %a) {
-; CHECK: test_undef_vtrn1_p16:
+; CHECK-LABEL: test_undef_vtrn1_p16:
; CHECK: ret
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -2344,7 +2341,7 @@ entry:
}
define <8 x i16> @test_undef_vtrn1q_p16(<8 x i16> %a) {
-; CHECK: test_undef_vtrn1q_p16:
+; CHECK-LABEL: test_undef_vtrn1q_p16:
; CHECK: ret
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -2352,7 +2349,7 @@ entry:
}
define <8 x i8> @test_undef_vtrn2_s8(<8 x i8> %a) {
-; CHECK: test_undef_vtrn2_s8:
+; CHECK-LABEL: test_undef_vtrn2_s8:
; CHECK: rev16 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -2360,7 +2357,7 @@ entry:
}
define <16 x i8> @test_undef_vtrn2q_s8(<16 x i8> %a) {
-; CHECK: test_undef_vtrn2q_s8:
+; CHECK-LABEL: test_undef_vtrn2q_s8:
; CHECK: rev16 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
@@ -2368,7 +2365,7 @@ entry:
}
define <4 x i16> @test_undef_vtrn2_s16(<4 x i16> %a) {
-; CHECK: test_undef_vtrn2_s16:
+; CHECK-LABEL: test_undef_vtrn2_s16:
; CHECK: rev32 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -2376,7 +2373,7 @@ entry:
}
define <8 x i16> @test_undef_vtrn2q_s16(<8 x i16> %a) {
-; CHECK: test_undef_vtrn2q_s16:
+; CHECK-LABEL: test_undef_vtrn2q_s16:
; CHECK: rev32 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -2384,7 +2381,7 @@ entry:
}
define <4 x i32> @test_undef_vtrn2q_s32(<4 x i32> %a) {
-; CHECK: test_undef_vtrn2q_s32:
+; CHECK-LABEL: test_undef_vtrn2q_s32:
; CHECK: rev64 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -2392,7 +2389,7 @@ entry:
}
define <8 x i8> @test_undef_vtrn2_u8(<8 x i8> %a) {
-; CHECK: test_undef_vtrn2_u8:
+; CHECK-LABEL: test_undef_vtrn2_u8:
; CHECK: rev16 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -2400,7 +2397,7 @@ entry:
}
define <16 x i8> @test_undef_vtrn2q_u8(<16 x i8> %a) {
-; CHECK: test_undef_vtrn2q_u8:
+; CHECK-LABEL: test_undef_vtrn2q_u8:
; CHECK: rev16 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
@@ -2408,7 +2405,7 @@ entry:
}
define <4 x i16> @test_undef_vtrn2_u16(<4 x i16> %a) {
-; CHECK: test_undef_vtrn2_u16:
+; CHECK-LABEL: test_undef_vtrn2_u16:
; CHECK: rev32 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -2416,7 +2413,7 @@ entry:
}
define <8 x i16> @test_undef_vtrn2q_u16(<8 x i16> %a) {
-; CHECK: test_undef_vtrn2q_u16:
+; CHECK-LABEL: test_undef_vtrn2q_u16:
; CHECK: rev32 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -2424,7 +2421,7 @@ entry:
}
define <4 x i32> @test_undef_vtrn2q_u32(<4 x i32> %a) {
-; CHECK: test_undef_vtrn2q_u32:
+; CHECK-LABEL: test_undef_vtrn2q_u32:
; CHECK: rev64 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -2432,7 +2429,7 @@ entry:
}
define <4 x float> @test_undef_vtrn2q_f32(<4 x float> %a) {
-; CHECK: test_undef_vtrn2q_f32:
+; CHECK-LABEL: test_undef_vtrn2q_f32:
; CHECK: rev64 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
%shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -2440,7 +2437,7 @@ entry:
}
define <8 x i8> @test_undef_vtrn2_p8(<8 x i8> %a) {
-; CHECK: test_undef_vtrn2_p8:
+; CHECK-LABEL: test_undef_vtrn2_p8:
; CHECK: rev16 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
%shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -2448,7 +2445,7 @@ entry:
}
define <16 x i8> @test_undef_vtrn2q_p8(<16 x i8> %a) {
-; CHECK: test_undef_vtrn2q_p8:
+; CHECK-LABEL: test_undef_vtrn2q_p8:
; CHECK: rev16 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
%shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
@@ -2456,7 +2453,7 @@ entry:
}
define <4 x i16> @test_undef_vtrn2_p16(<4 x i16> %a) {
-; CHECK: test_undef_vtrn2_p16:
+; CHECK-LABEL: test_undef_vtrn2_p16:
; CHECK: rev32 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
%shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -2464,7 +2461,7 @@ entry:
}
define <8 x i16> @test_undef_vtrn2q_p16(<8 x i16> %a) {
-; CHECK: test_undef_vtrn2q_p16:
+; CHECK-LABEL: test_undef_vtrn2q_p16:
; CHECK: rev32 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -2472,7 +2469,7 @@ entry:
}
define %struct.int8x8x2_t @test_vuzp_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vuzp_s8:
+; CHECK-LABEL: test_vuzp_s8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
@@ -2484,7 +2481,7 @@ entry:
}
define %struct.int16x4x2_t @test_vuzp_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vuzp_s16:
+; CHECK-LABEL: test_vuzp_s16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
@@ -2496,9 +2493,9 @@ entry:
}
define %struct.int32x2x2_t @test_vuzp_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vuzp_s32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vuzp_s32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vuzp.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
%vuzp1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
@@ -2508,7 +2505,7 @@ entry:
}
define %struct.uint8x8x2_t @test_vuzp_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vuzp_u8:
+; CHECK-LABEL: test_vuzp_u8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
@@ -2520,7 +2517,7 @@ entry:
}
define %struct.uint16x4x2_t @test_vuzp_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vuzp_u16:
+; CHECK-LABEL: test_vuzp_u16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
@@ -2532,9 +2529,9 @@ entry:
}
define %struct.uint32x2x2_t @test_vuzp_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vuzp_u32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vuzp_u32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vuzp.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
%vuzp1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
@@ -2544,9 +2541,9 @@ entry:
}
define %struct.float32x2x2_t @test_vuzp_f32(<2 x float> %a, <2 x float> %b) {
-; CHECK: test_vuzp_f32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vuzp_f32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vuzp.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
%vuzp1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
@@ -2556,7 +2553,7 @@ entry:
}
define %struct.poly8x8x2_t @test_vuzp_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vuzp_p8:
+; CHECK-LABEL: test_vuzp_p8:
; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
@@ -2568,7 +2565,7 @@ entry:
}
define %struct.poly16x4x2_t @test_vuzp_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vuzp_p16:
+; CHECK-LABEL: test_vuzp_p16:
; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
@@ -2580,7 +2577,7 @@ entry:
}
define %struct.int8x16x2_t @test_vuzpq_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vuzpq_s8:
+; CHECK-LABEL: test_vuzpq_s8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
@@ -2592,7 +2589,7 @@ entry:
}
define %struct.int16x8x2_t @test_vuzpq_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vuzpq_s16:
+; CHECK-LABEL: test_vuzpq_s16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
@@ -2604,7 +2601,7 @@ entry:
}
define %struct.int32x4x2_t @test_vuzpq_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vuzpq_s32:
+; CHECK-LABEL: test_vuzpq_s32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
@@ -2616,7 +2613,7 @@ entry:
}
define %struct.uint8x16x2_t @test_vuzpq_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vuzpq_u8:
+; CHECK-LABEL: test_vuzpq_u8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
@@ -2628,7 +2625,7 @@ entry:
}
define %struct.uint16x8x2_t @test_vuzpq_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vuzpq_u16:
+; CHECK-LABEL: test_vuzpq_u16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
@@ -2640,7 +2637,7 @@ entry:
}
define %struct.uint32x4x2_t @test_vuzpq_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vuzpq_u32:
+; CHECK-LABEL: test_vuzpq_u32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
@@ -2652,7 +2649,7 @@ entry:
}
define %struct.float32x4x2_t @test_vuzpq_f32(<4 x float> %a, <4 x float> %b) {
-; CHECK: test_vuzpq_f32:
+; CHECK-LABEL: test_vuzpq_f32:
; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
@@ -2664,7 +2661,7 @@ entry:
}
define %struct.poly8x16x2_t @test_vuzpq_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vuzpq_p8:
+; CHECK-LABEL: test_vuzpq_p8:
; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
@@ -2676,7 +2673,7 @@ entry:
}
define %struct.poly16x8x2_t @test_vuzpq_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vuzpq_p16:
+; CHECK-LABEL: test_vuzpq_p16:
; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
@@ -2688,7 +2685,7 @@ entry:
}
define %struct.int8x8x2_t @test_vzip_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vzip_s8:
+; CHECK-LABEL: test_vzip_s8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
@@ -2700,7 +2697,7 @@ entry:
}
define %struct.int16x4x2_t @test_vzip_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vzip_s16:
+; CHECK-LABEL: test_vzip_s16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
@@ -2712,9 +2709,9 @@ entry:
}
define %struct.int32x2x2_t @test_vzip_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vzip_s32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vzip_s32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vzip.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
%vzip1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
@@ -2724,7 +2721,7 @@ entry:
}
define %struct.uint8x8x2_t @test_vzip_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vzip_u8:
+; CHECK-LABEL: test_vzip_u8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
@@ -2736,7 +2733,7 @@ entry:
}
define %struct.uint16x4x2_t @test_vzip_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vzip_u16:
+; CHECK-LABEL: test_vzip_u16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
@@ -2748,9 +2745,9 @@ entry:
}
define %struct.uint32x2x2_t @test_vzip_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vzip_u32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vzip_u32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vzip.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
%vzip1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
@@ -2760,9 +2757,9 @@ entry:
}
define %struct.float32x2x2_t @test_vzip_f32(<2 x float> %a, <2 x float> %b) {
-; CHECK: test_vzip_f32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vzip_f32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vzip.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
%vzip1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
@@ -2772,7 +2769,7 @@ entry:
}
define %struct.poly8x8x2_t @test_vzip_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vzip_p8:
+; CHECK-LABEL: test_vzip_p8:
; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
@@ -2784,7 +2781,7 @@ entry:
}
define %struct.poly16x4x2_t @test_vzip_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vzip_p16:
+; CHECK-LABEL: test_vzip_p16:
; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
@@ -2796,7 +2793,7 @@ entry:
}
define %struct.int8x16x2_t @test_vzipq_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vzipq_s8:
+; CHECK-LABEL: test_vzipq_s8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
@@ -2808,7 +2805,7 @@ entry:
}
define %struct.int16x8x2_t @test_vzipq_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vzipq_s16:
+; CHECK-LABEL: test_vzipq_s16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
@@ -2820,7 +2817,7 @@ entry:
}
define %struct.int32x4x2_t @test_vzipq_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vzipq_s32:
+; CHECK-LABEL: test_vzipq_s32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
@@ -2832,7 +2829,7 @@ entry:
}
define %struct.uint8x16x2_t @test_vzipq_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vzipq_u8:
+; CHECK-LABEL: test_vzipq_u8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
@@ -2844,7 +2841,7 @@ entry:
}
define %struct.uint16x8x2_t @test_vzipq_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vzipq_u16:
+; CHECK-LABEL: test_vzipq_u16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
@@ -2856,7 +2853,7 @@ entry:
}
define %struct.uint32x4x2_t @test_vzipq_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vzipq_u32:
+; CHECK-LABEL: test_vzipq_u32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
@@ -2868,7 +2865,7 @@ entry:
}
define %struct.float32x4x2_t @test_vzipq_f32(<4 x float> %a, <4 x float> %b) {
-; CHECK: test_vzipq_f32:
+; CHECK-LABEL: test_vzipq_f32:
; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
@@ -2880,7 +2877,7 @@ entry:
}
define %struct.poly8x16x2_t @test_vzipq_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vzipq_p8:
+; CHECK-LABEL: test_vzipq_p8:
; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
@@ -2892,7 +2889,7 @@ entry:
}
define %struct.poly16x8x2_t @test_vzipq_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vzipq_p16:
+; CHECK-LABEL: test_vzipq_p16:
; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
@@ -2904,7 +2901,7 @@ entry:
}
define %struct.int8x8x2_t @test_vtrn_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtrn_s8:
+; CHECK-LABEL: test_vtrn_s8:
; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
@@ -2916,7 +2913,7 @@ entry:
}
define %struct.int16x4x2_t @test_vtrn_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vtrn_s16:
+; CHECK-LABEL: test_vtrn_s16:
; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
@@ -2928,9 +2925,9 @@ entry:
}
define %struct.int32x2x2_t @test_vtrn_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vtrn_s32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vtrn_s32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vtrn.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
%vtrn1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
@@ -2940,7 +2937,7 @@ entry:
}
define %struct.uint8x8x2_t @test_vtrn_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtrn_u8:
+; CHECK-LABEL: test_vtrn_u8:
; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
@@ -2952,7 +2949,7 @@ entry:
}
define %struct.uint16x4x2_t @test_vtrn_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vtrn_u16:
+; CHECK-LABEL: test_vtrn_u16:
; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
@@ -2964,9 +2961,9 @@ entry:
}
define %struct.uint32x2x2_t @test_vtrn_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vtrn_u32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vtrn_u32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vtrn.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
%vtrn1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
@@ -2976,9 +2973,9 @@ entry:
}
define %struct.float32x2x2_t @test_vtrn_f32(<2 x float> %a, <2 x float> %b) {
-; CHECK: test_vtrn_f32:
-; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
-; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+; CHECK-LABEL: test_vtrn_f32:
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
entry:
%vtrn.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
%vtrn1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
@@ -2988,7 +2985,7 @@ entry:
}
define %struct.poly8x8x2_t @test_vtrn_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtrn_p8:
+; CHECK-LABEL: test_vtrn_p8:
; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
entry:
@@ -3000,7 +2997,7 @@ entry:
}
define %struct.poly16x4x2_t @test_vtrn_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vtrn_p16:
+; CHECK-LABEL: test_vtrn_p16:
; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
entry:
@@ -3012,7 +3009,7 @@ entry:
}
define %struct.int8x16x2_t @test_vtrnq_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vtrnq_s8:
+; CHECK-LABEL: test_vtrnq_s8:
; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
@@ -3024,7 +3021,7 @@ entry:
}
define %struct.int16x8x2_t @test_vtrnq_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vtrnq_s16:
+; CHECK-LABEL: test_vtrnq_s16:
; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
@@ -3036,7 +3033,7 @@ entry:
}
define %struct.int32x4x2_t @test_vtrnq_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vtrnq_s32:
+; CHECK-LABEL: test_vtrnq_s32:
; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
@@ -3048,7 +3045,7 @@ entry:
}
define %struct.uint8x16x2_t @test_vtrnq_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vtrnq_u8:
+; CHECK-LABEL: test_vtrnq_u8:
; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
@@ -3060,7 +3057,7 @@ entry:
}
define %struct.uint16x8x2_t @test_vtrnq_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vtrnq_u16:
+; CHECK-LABEL: test_vtrnq_u16:
; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
@@ -3072,7 +3069,7 @@ entry:
}
define %struct.uint32x4x2_t @test_vtrnq_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vtrnq_u32:
+; CHECK-LABEL: test_vtrnq_u32:
; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
@@ -3084,7 +3081,7 @@ entry:
}
define %struct.float32x4x2_t @test_vtrnq_f32(<4 x float> %a, <4 x float> %b) {
-; CHECK: test_vtrnq_f32:
+; CHECK-LABEL: test_vtrnq_f32:
; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
entry:
@@ -3096,7 +3093,7 @@ entry:
}
define %struct.poly8x16x2_t @test_vtrnq_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vtrnq_p8:
+; CHECK-LABEL: test_vtrnq_p8:
; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
entry:
@@ -3108,7 +3105,7 @@ entry:
}
define %struct.poly16x8x2_t @test_vtrnq_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vtrnq_p16:
+; CHECK-LABEL: test_vtrnq_p16:
; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
@@ -3120,7 +3117,7 @@ entry:
}
define %struct.uint8x8x2_t @test_uzp(<16 x i8> %y) {
-; CHECK: test_uzp:
+; CHECK-LABEL: test_uzp:
%vuzp.i = shufflevector <16 x i8> %y, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%vuzp1.i = shufflevector <16 x i8> %y, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -3128,7 +3125,4 @@ define %struct.uint8x8x2_t @test_uzp(<16 x i8> %y) {
%.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
ret %struct.uint8x8x2_t %.fca.0.1.insert
-; CHECK: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-; CHECK-NEXT: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
diff --git a/test/CodeGen/AArch64/neon-rounding-halving-add.ll b/test/CodeGen/AArch64/neon-rounding-halving-add.ll
deleted file mode 100644
index 009da3b..0000000
--- a/test/CodeGen/AArch64/neon-rounding-halving-add.ll
+++ /dev/null
@@ -1,105 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_urhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_urhadd_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: urhadd v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_srhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_srhadd_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: srhadd v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_urhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_urhadd_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: urhadd v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_srhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_srhadd_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: srhadd v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_urhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_urhadd_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: urhadd v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_srhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_srhadd_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: srhadd v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_urhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_urhadd_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: urhadd v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_srhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_srhadd_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: srhadd v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-declare <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_urhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_urhadd_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: urhadd v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_srhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_srhadd_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: srhadd v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_urhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_urhadd_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: urhadd v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_srhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_srhadd_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: srhadd v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-
diff --git a/test/CodeGen/AArch64/neon-rounding-shift.ll b/test/CodeGen/AArch64/neon-rounding-shift.ll
deleted file mode 100644
index 5b4ec28..0000000
--- a/test/CodeGen/AArch64/neon-rounding-shift.ll
+++ /dev/null
@@ -1,121 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_urshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_urshl_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: urshl v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_srshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_srshl_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: srshl v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_urshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_urshl_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: urshl v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_srshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_srshl_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: srshl v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_urshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_urshl_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: urshl v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_srshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_srshl_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: srshl v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_urshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_urshl_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: urshl v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_srshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_srshl_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: srshl v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_urshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_urshl_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: urshl v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_srshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_srshl_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: srshl v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_urshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_urshl_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: urshl v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_srshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_srshl_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: srshl v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>)
-declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>)
-
-define <2 x i64> @test_urshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_urshl_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: urshl v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
-define <2 x i64> @test_srshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_srshl_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: srshl v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
diff --git a/test/CodeGen/AArch64/neon-saturating-add-sub.ll b/test/CodeGen/AArch64/neon-saturating-add-sub.ll
deleted file mode 100644
index fc60d90..0000000
--- a/test/CodeGen/AArch64/neon-saturating-add-sub.ll
+++ /dev/null
@@ -1,241 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_uqadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_uqadd_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: uqadd v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_sqadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_sqadd_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: sqadd v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_uqadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_uqadd_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: uqadd v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_sqadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_sqadd_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: sqadd v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_uqadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_uqadd_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: uqadd v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_sqadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_sqadd_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: sqadd v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-declare <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_uqadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_uqadd_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: uqadd v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_sqadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_sqadd_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: sqadd v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-declare <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_uqadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_uqadd_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: uqadd v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_sqadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_sqadd_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: sqadd v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_uqadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_uqadd_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: uqadd v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_sqadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_sqadd_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: sqadd v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-
-
-declare <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64>, <2 x i64>)
-declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>)
-
-define <2 x i64> @test_uqadd_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_uqadd_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: uqadd v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
-define <2 x i64> @test_sqadd_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_sqadd_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: sqadd v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
-declare <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_uqsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_uqsub_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: uqsub v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_sqsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_sqsub_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: sqsub v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_uqsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_uqsub_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: uqsub v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_sqsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_sqsub_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: sqsub v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_uqsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_uqsub_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: uqsub v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_sqsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_sqsub_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: sqsub v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_uqsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_uqsub_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: uqsub v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_sqsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_sqsub_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: sqsub v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-declare <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_uqsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_uqsub_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: uqsub v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_sqsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_sqsub_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: sqsub v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_uqsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_uqsub_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: uqsub v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_sqsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_sqsub_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: sqsub v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-declare <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64>, <2 x i64>)
-declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>)
-
-define <2 x i64> @test_uqsub_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_uqsub_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: uqsub v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
-define <2 x i64> @test_sqsub_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_sqsub_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: sqsub v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
diff --git a/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll b/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll
deleted file mode 100644
index d89262c..0000000
--- a/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll
+++ /dev/null
@@ -1,121 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_uqrshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_uqrshl_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: uqrshl v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_sqrshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_sqrshl_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: sqrshl v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_uqrshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_uqrshl_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: uqrshl v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_sqrshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_sqrshl_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: sqrshl v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_uqrshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_uqrshl_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: uqrshl v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_sqrshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_sqrshl_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: sqrshl v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-declare <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_uqrshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_uqrshl_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: uqrshl v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_sqrshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_sqrshl_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: sqrshl v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-declare <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_uqrshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_uqrshl_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: uqrshl v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_sqrshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_sqrshl_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: sqrshl v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_uqrshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_uqrshl_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: uqrshl v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_sqrshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_sqrshl_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: sqrshl v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-declare <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64>, <2 x i64>)
-declare <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64>, <2 x i64>)
-
-define <2 x i64> @test_uqrshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_uqrshl_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: uqrshl v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
-define <2 x i64> @test_sqrshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_sqrshl_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: sqrshl v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
diff --git a/test/CodeGen/AArch64/neon-saturating-shift.ll b/test/CodeGen/AArch64/neon-saturating-shift.ll
deleted file mode 100644
index 11009fb..0000000
--- a/test/CodeGen/AArch64/neon-saturating-shift.ll
+++ /dev/null
@@ -1,121 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_uqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_uqshl_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: uqshl v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_sqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_sqshl_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: sqshl v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_uqshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_uqshl_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: uqshl v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_sqshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_sqshl_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: sqshl v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_uqshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_uqshl_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: uqshl v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_sqshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_sqshl_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: sqshl v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_uqshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_uqshl_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: uqshl v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_sqshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_sqshl_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: sqshl v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_uqshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_uqshl_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: uqshl v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_sqshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_sqshl_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: sqshl v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_uqshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_uqshl_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: uqshl v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_sqshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_sqshl_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: sqshl v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>)
-declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>)
-
-define <2 x i64> @test_uqshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_uqshl_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: uqshl v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
-define <2 x i64> @test_sqshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_sqshl_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: sqshl v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
diff --git a/test/CodeGen/AArch64/neon-scalar-abs.ll b/test/CodeGen/AArch64/neon-scalar-abs.ll
deleted file mode 100644
index 03a89e04..0000000
--- a/test/CodeGen/AArch64/neon-scalar-abs.ll
+++ /dev/null
@@ -1,61 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define i64 @test_vabsd_s64(i64 %a) {
-; CHECK: test_vabsd_s64
-; CHECK: abs {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %vabs.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vabs1.i = tail call <1 x i64> @llvm.aarch64.neon.vabs(<1 x i64> %vabs.i)
- %0 = extractelement <1 x i64> %vabs1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vabs(<1 x i64>)
-
-define i8 @test_vqabsb_s8(i8 %a) {
-; CHECK: test_vqabsb_s8
-; CHECK: sqabs {{b[0-9]+}}, {{b[0-9]+}}
-entry:
- %vqabs.i = insertelement <1 x i8> undef, i8 %a, i32 0
- %vqabs1.i = call <1 x i8> @llvm.arm.neon.vqabs.v1i8(<1 x i8> %vqabs.i)
- %0 = extractelement <1 x i8> %vqabs1.i, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.arm.neon.vqabs.v1i8(<1 x i8>)
-
-define i16 @test_vqabsh_s16(i16 %a) {
-; CHECK: test_vqabsh_s16
-; CHECK: sqabs {{h[0-9]+}}, {{h[0-9]+}}
-entry:
- %vqabs.i = insertelement <1 x i16> undef, i16 %a, i32 0
- %vqabs1.i = call <1 x i16> @llvm.arm.neon.vqabs.v1i16(<1 x i16> %vqabs.i)
- %0 = extractelement <1 x i16> %vqabs1.i, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.arm.neon.vqabs.v1i16(<1 x i16>)
-
-define i32 @test_vqabss_s32(i32 %a) {
-; CHECK: test_vqabss_s32
-; CHECK: sqabs {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %vqabs.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %vqabs1.i = call <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32> %vqabs.i)
- %0 = extractelement <1 x i32> %vqabs1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32>)
-
-define i64 @test_vqabsd_s64(i64 %a) {
-; CHECK: test_vqabsd_s64
-; CHECK: sqabs {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %vqabs.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vqabs1.i = call <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64> %vqabs.i)
- %0 = extractelement <1 x i64> %vqabs1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64>)
diff --git a/test/CodeGen/AArch64/neon-scalar-add-sub.ll b/test/CodeGen/AArch64/neon-scalar-add-sub.ll
deleted file mode 100644
index 4f322e0..0000000
--- a/test/CodeGen/AArch64/neon-scalar-add-sub.ll
+++ /dev/null
@@ -1,50 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define <1 x i64> @add1xi64(<1 x i64> %A, <1 x i64> %B) {
-;CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- %tmp3 = add <1 x i64> %A, %B;
- ret <1 x i64> %tmp3
-}
-
-define <1 x i64> @sub1xi64(<1 x i64> %A, <1 x i64> %B) {
-;CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- %tmp3 = sub <1 x i64> %A, %B;
- ret <1 x i64> %tmp3
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vaddds(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vadddu(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_add_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_add_v1i64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vaddds(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_uadd_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uadd_v1i64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vadddu(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vsubds(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vsubdu(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_sub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sub_v1i64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vsubds(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_usub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_usub_v1i64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vsubdu(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-
-
diff --git a/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll b/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
index 247514c..32f5962 100644
--- a/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
+++ b/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
@@ -4,7 +4,7 @@ declare float @llvm.fma.f32(float, float, float)
declare double @llvm.fma.f64(double, double, double)
define float @test_fmla_ss4S(float %a, float %b, <4 x float> %v) {
- ; CHECK: test_fmla_ss4S
+ ; CHECK-LABEL: test_fmla_ss4S
; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
@@ -12,7 +12,7 @@ define float @test_fmla_ss4S(float %a, float %b, <4 x float> %v) {
}
define float @test_fmla_ss4S_swap(float %a, float %b, <4 x float> %v) {
- ; CHECK: test_fmla_ss4S_swap
+ ; CHECK-LABEL: test_fmla_ss4S_swap
; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = call float @llvm.fma.f32(float %tmp1, float %a, float %a)
@@ -20,7 +20,7 @@ define float @test_fmla_ss4S_swap(float %a, float %b, <4 x float> %v) {
}
define float @test_fmla_ss2S(float %a, float %b, <2 x float> %v) {
- ; CHECK: test_fmla_ss2S
+ ; CHECK-LABEL: test_fmla_ss2S
; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp1 = extractelement <2 x float> %v, i32 1
%tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
@@ -28,15 +28,15 @@ define float @test_fmla_ss2S(float %a, float %b, <2 x float> %v) {
}
define double @test_fmla_ddD(double %a, double %b, <1 x double> %v) {
- ; CHECK: test_fmla_ddD
- ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+ ; CHECK-LABEL: test_fmla_ddD
+ ; CHECK: {{fmla d[0-9]+, d[0-9]+, v[0-9]+.d\[0]|fmadd d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}}
%tmp1 = extractelement <1 x double> %v, i32 0
%tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a)
ret double %tmp2
}
define double @test_fmla_dd2D(double %a, double %b, <2 x double> %v) {
- ; CHECK: test_fmla_dd2D
+ ; CHECK-LABEL: test_fmla_dd2D
; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a)
@@ -44,7 +44,7 @@ define double @test_fmla_dd2D(double %a, double %b, <2 x double> %v) {
}
define double @test_fmla_dd2D_swap(double %a, double %b, <2 x double> %v) {
- ; CHECK: test_fmla_dd2D_swap
+ ; CHECK-LABEL: test_fmla_dd2D_swap
; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = call double @llvm.fma.f64(double %tmp1, double %b, double %a)
@@ -52,7 +52,7 @@ define double @test_fmla_dd2D_swap(double %a, double %b, <2 x double> %v) {
}
define float @test_fmls_ss4S(float %a, float %b, <4 x float> %v) {
- ; CHECK: test_fmls_ss4S
+ ; CHECK-LABEL: test_fmls_ss4S
; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = fsub float -0.0, %tmp1
@@ -61,7 +61,7 @@ define float @test_fmls_ss4S(float %a, float %b, <4 x float> %v) {
}
define float @test_fmls_ss4S_swap(float %a, float %b, <4 x float> %v) {
- ; CHECK: test_fmls_ss4S_swap
+ ; CHECK-LABEL: test_fmls_ss4S_swap
; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
%tmp1 = extractelement <4 x float> %v, i32 3
%tmp2 = fsub float -0.0, %tmp1
@@ -71,7 +71,7 @@ define float @test_fmls_ss4S_swap(float %a, float %b, <4 x float> %v) {
define float @test_fmls_ss2S(float %a, float %b, <2 x float> %v) {
- ; CHECK: test_fmls_ss2S
+ ; CHECK-LABEL: test_fmls_ss2S
; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
%tmp1 = extractelement <2 x float> %v, i32 1
%tmp2 = fsub float -0.0, %tmp1
@@ -80,8 +80,8 @@ define float @test_fmls_ss2S(float %a, float %b, <2 x float> %v) {
}
define double @test_fmls_ddD(double %a, double %b, <1 x double> %v) {
- ; CHECK: test_fmls_ddD
- ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+ ; CHECK-LABEL: test_fmls_ddD
+ ; CHECK: {{fmls d[0-9]+, d[0-9]+, v[0-9]+.d\[0]|fmsub d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}}
%tmp1 = extractelement <1 x double> %v, i32 0
%tmp2 = fsub double -0.0, %tmp1
%tmp3 = call double @llvm.fma.f64(double %tmp2, double %tmp1, double %a)
@@ -89,7 +89,7 @@ define double @test_fmls_ddD(double %a, double %b, <1 x double> %v) {
}
define double @test_fmls_dd2D(double %a, double %b, <2 x double> %v) {
- ; CHECK: test_fmls_dd2D
+ ; CHECK-LABEL: test_fmls_dd2D
; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = fsub double -0.0, %tmp1
@@ -98,7 +98,7 @@ define double @test_fmls_dd2D(double %a, double %b, <2 x double> %v) {
}
define double @test_fmls_dd2D_swap(double %a, double %b, <2 x double> %v) {
- ; CHECK: test_fmls_dd2D_swap
+ ; CHECK-LABEL: test_fmls_dd2D_swap
; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
%tmp2 = fsub double -0.0, %tmp1
diff --git a/test/CodeGen/AArch64/neon-scalar-compare.ll b/test/CodeGen/AArch64/neon-scalar-compare.ll
deleted file mode 100644
index e1f3964..0000000
--- a/test/CodeGen/AArch64/neon-scalar-compare.ll
+++ /dev/null
@@ -1,343 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-;; Scalar Integer Compare
-
-define i64 @test_vceqd(i64 %a, i64 %b) {
-; CHECK: test_vceqd
-; CHECK: cmeq {{d[0-9]+}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vceq.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vceq1.i = insertelement <1 x i64> undef, i64 %b, i32 0
- %vceq2.i = call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1i64.v1i64(<1 x i64> %vceq.i, <1 x i64> %vceq1.i)
- %0 = extractelement <1 x i64> %vceq2.i, i32 0
- ret i64 %0
-}
-
-define i64 @test_vceqzd(i64 %a) {
-; CHECK: test_vceqzd
-; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0
-entry:
- %vceqz.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vceqz1.i = call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1i64.v1i64(<1 x i64> %vceqz.i, <1 x i64> zeroinitializer)
- %0 = extractelement <1 x i64> %vceqz1.i, i32 0
- ret i64 %0
-}
-
-define i64 @test_vcged(i64 %a, i64 %b) {
-; CHECK: test_vcged
-; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcge.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vcge1.i = insertelement <1 x i64> undef, i64 %b, i32 0
- %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64> %vcge.i, <1 x i64> %vcge1.i)
- %0 = extractelement <1 x i64> %vcge2.i, i32 0
- ret i64 %0
-}
-
-define i64 @test_vcgezd(i64 %a) {
-; CHECK: test_vcgezd
-; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, #0x0
-entry:
- %vcgez.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vcgez1.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64> %vcgez.i, <1 x i64> zeroinitializer)
- %0 = extractelement <1 x i64> %vcgez1.i, i32 0
- ret i64 %0
-}
-
-define i64 @test_vcgtd(i64 %a, i64 %b) {
-; CHECK: test_vcgtd
-; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcgt.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vcgt1.i = insertelement <1 x i64> undef, i64 %b, i32 0
- %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64> %vcgt.i, <1 x i64> %vcgt1.i)
- %0 = extractelement <1 x i64> %vcgt2.i, i32 0
- ret i64 %0
-}
-
-define i64 @test_vcgtzd(i64 %a) {
-; CHECK: test_vcgtzd
-; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, #0x0
-entry:
- %vcgtz.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vcgtz1.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64> %vcgtz.i, <1 x i64> zeroinitializer)
- %0 = extractelement <1 x i64> %vcgtz1.i, i32 0
- ret i64 %0
-}
-
-define i64 @test_vcled(i64 %a, i64 %b) {
-; CHECK: test_vcled
-; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcgt.i = insertelement <1 x i64> undef, i64 %b, i32 0
- %vcgt1.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64> %vcgt.i, <1 x i64> %vcgt1.i)
- %0 = extractelement <1 x i64> %vcgt2.i, i32 0
- ret i64 %0
-}
-
-define i64 @test_vclezd(i64 %a) {
-; CHECK: test_vclezd
-; CHECK: cmle {{d[0-9]}}, {{d[0-9]}}, #0x0
-entry:
- %vclez.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vclez1.i = call <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1i64.v1i64(<1 x i64> %vclez.i, <1 x i64> zeroinitializer)
- %0 = extractelement <1 x i64> %vclez1.i, i32 0
- ret i64 %0
-}
-
-define i64 @test_vcltd(i64 %a, i64 %b) {
-; CHECK: test_vcltd
-; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcge.i = insertelement <1 x i64> undef, i64 %b, i32 0
- %vcge1.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64> %vcge.i, <1 x i64> %vcge1.i)
- %0 = extractelement <1 x i64> %vcge2.i, i32 0
- ret i64 %0
-}
-
-define i64 @test_vcltzd(i64 %a) {
-; CHECK: test_vcltzd
-; CHECK: cmlt {{d[0-9]}}, {{d[0-9]}}, #0x0
-entry:
- %vcltz.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vcltz1.i = call <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1i64.v1i64(<1 x i64> %vcltz.i, <1 x i64> zeroinitializer)
- %0 = extractelement <1 x i64> %vcltz1.i, i32 0
- ret i64 %0
-}
-
-define i64 @test_vtstd(i64 %a, i64 %b) {
-; CHECK: test_vtstd
-; CHECK: cmtst {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vtst.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vtst1.i = insertelement <1 x i64> undef, i64 %b, i32 0
- %vtst2.i = call <1 x i64> @llvm.aarch64.neon.vtstd.v1i64.v1i64.v1i64(<1 x i64> %vtst.i, <1 x i64> %vtst1.i)
- %0 = extractelement <1 x i64> %vtst2.i, i32 0
- ret i64 %0
-}
-
-
-define <1 x i64> @test_vcage_f64(<1 x double> %a, <1 x double> %b) #0 {
-; CHECK: test_vcage_f64
-; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %vcage2.i = tail call <1 x i64> @llvm.arm.neon.vacge.v1i64.v1f64(<1 x double> %a, <1 x double> %b) #2
- ret <1 x i64> %vcage2.i
-}
-
-define <1 x i64> @test_vcagt_f64(<1 x double> %a, <1 x double> %b) #0 {
-; CHECK: test_vcagt_f64
-; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %vcagt2.i = tail call <1 x i64> @llvm.arm.neon.vacgt.v1i64.v1f64(<1 x double> %a, <1 x double> %b) #2
- ret <1 x i64> %vcagt2.i
-}
-
-define <1 x i64> @test_vcale_f64(<1 x double> %a, <1 x double> %b) #0 {
-; CHECK: test_vcale_f64
-; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %vcage2.i = tail call <1 x i64> @llvm.arm.neon.vacge.v1i64.v1f64(<1 x double> %b, <1 x double> %a) #2
- ret <1 x i64> %vcage2.i
-}
-
-define <1 x i64> @test_vcalt_f64(<1 x double> %a, <1 x double> %b) #0 {
-; CHECK: test_vcalt_f64
-; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %vcagt2.i = tail call <1 x i64> @llvm.arm.neon.vacgt.v1i64.v1f64(<1 x double> %b, <1 x double> %a) #2
- ret <1 x i64> %vcagt2.i
-}
-
-define <1 x i64> @test_vceq_s64(<1 x i64> %a, <1 x i64> %b) #0 {
-; CHECK: test_vceq_s64
-; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = icmp eq <1 x i64> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vceq_u64(<1 x i64> %a, <1 x i64> %b) #0 {
-; CHECK: test_vceq_u64
-; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = icmp eq <1 x i64> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vceq_f64(<1 x double> %a, <1 x double> %b) #0 {
-; CHECK: test_vceq_f64
-; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = fcmp oeq <1 x double> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vcge_s64(<1 x i64> %a, <1 x i64> %b) #0 {
-; CHECK: test_vcge_s64
-; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = icmp sge <1 x i64> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vcge_u64(<1 x i64> %a, <1 x i64> %b) #0 {
-; CHECK: test_vcge_u64
-; CHECK: cmhs {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = icmp uge <1 x i64> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vcge_f64(<1 x double> %a, <1 x double> %b) #0 {
-; CHECK: test_vcge_f64
-; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = fcmp oge <1 x double> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vcle_s64(<1 x i64> %a, <1 x i64> %b) #0 {
-; CHECK: test_vcle_s64
-; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = icmp sle <1 x i64> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vcle_u64(<1 x i64> %a, <1 x i64> %b) #0 {
-; CHECK: test_vcle_u64
-; CHECK: cmhs {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = icmp ule <1 x i64> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vcle_f64(<1 x double> %a, <1 x double> %b) #0 {
-; CHECK: test_vcle_f64
-; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = fcmp ole <1 x double> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vcgt_s64(<1 x i64> %a, <1 x i64> %b) #0 {
-; CHECK: test_vcgt_s64
-; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = icmp sgt <1 x i64> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vcgt_u64(<1 x i64> %a, <1 x i64> %b) #0 {
-; CHECK: test_vcgt_u64
-; CHECK: cmhi {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = icmp ugt <1 x i64> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vcgt_f64(<1 x double> %a, <1 x double> %b) #0 {
-; CHECK: test_vcgt_f64
-; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = fcmp ogt <1 x double> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vclt_s64(<1 x i64> %a, <1 x i64> %b) #0 {
-; CHECK: test_vclt_s64
-; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = icmp slt <1 x i64> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vclt_u64(<1 x i64> %a, <1 x i64> %b) #0 {
-; CHECK: test_vclt_u64
-; CHECK: cmhi {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = icmp ult <1 x i64> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vclt_f64(<1 x double> %a, <1 x double> %b) #0 {
-; CHECK: test_vclt_f64
-; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
- %cmp.i = fcmp olt <1 x double> %a, %b
- %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
- ret <1 x i64> %sext.i
-}
-
-define <1 x i64> @test_vceqz_s64(<1 x i64> %a) #0 {
-; CHECK: test_vceqz_s64
-; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0
- %1 = icmp eq <1 x i64> %a, zeroinitializer
- %vceqz.i = sext <1 x i1> %1 to <1 x i64>
- ret <1 x i64> %vceqz.i
-}
-
-define <1 x i64> @test_vceqz_u64(<1 x i64> %a) #0 {
-; CHECK: test_vceqz_u64
-; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0
- %1 = icmp eq <1 x i64> %a, zeroinitializer
- %vceqz.i = sext <1 x i1> %1 to <1 x i64>
- ret <1 x i64> %vceqz.i
-}
-
-define <1 x i64> @test_vceqz_p64(<1 x i64> %a) #0 {
-; CHECK: test_vceqz_p64
-; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0
- %1 = icmp eq <1 x i64> %a, zeroinitializer
- %vceqz.i = sext <1 x i1> %1 to <1 x i64>
- ret <1 x i64> %vceqz.i
-}
-
-define <2 x i64> @test_vceqzq_p64(<2 x i64> %a) #0 {
-; CHECK: test_vceqzq_p64
-; CHECK: cmeq {{v[0-9]}}.2d, {{v[0-9]}}.2d, #0
- %1 = icmp eq <2 x i64> %a, zeroinitializer
- %vceqz.i = sext <2 x i1> %1 to <2 x i64>
- ret <2 x i64> %vceqz.i
-}
-
-define <1 x i64> @test_vcgez_s64(<1 x i64> %a) #0 {
-; CHECK: test_vcgez_s64
-; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, #0x0
- %1 = icmp sge <1 x i64> %a, zeroinitializer
- %vcgez.i = sext <1 x i1> %1 to <1 x i64>
- ret <1 x i64> %vcgez.i
-}
-
-define <1 x i64> @test_vclez_s64(<1 x i64> %a) #0 {
-; CHECK: test_vclez_s64
-; CHECK: cmle {{d[0-9]}}, {{d[0-9]}}, #0x0
- %1 = icmp sle <1 x i64> %a, zeroinitializer
- %vclez.i = sext <1 x i1> %1 to <1 x i64>
- ret <1 x i64> %vclez.i
-}
-
-define <1 x i64> @test_vcgtz_s64(<1 x i64> %a) #0 {
-; CHECK: test_vcgtz_s64
-; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, #0x0
- %1 = icmp sgt <1 x i64> %a, zeroinitializer
- %vcgtz.i = sext <1 x i1> %1 to <1 x i64>
- ret <1 x i64> %vcgtz.i
-}
-
-define <1 x i64> @test_vcltz_s64(<1 x i64> %a) #0 {
-; CHECK: test_vcltz_s64
-; CHECK: cmlt {{d[0-9]}}, {{d[0-9]}}, #0
- %1 = icmp slt <1 x i64> %a, zeroinitializer
- %vcltz.i = sext <1 x i1> %1 to <1 x i64>
- ret <1 x i64> %vcltz.i
-}
-
-declare <1 x i64> @llvm.arm.neon.vacgt.v1i64.v1f64(<1 x double>, <1 x double>)
-declare <1 x i64> @llvm.arm.neon.vacge.v1i64.v1f64(<1 x double>, <1 x double>)
-declare <1 x i64> @llvm.aarch64.neon.vtstd.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vchs.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vchi.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
diff --git a/test/CodeGen/AArch64/neon-scalar-copy.ll b/test/CodeGen/AArch64/neon-scalar-copy.ll
index fadd734..a01df32 100644
--- a/test/CodeGen/AArch64/neon-scalar-copy.ll
+++ b/test/CodeGen/AArch64/neon-scalar-copy.ll
@@ -1,103 +1,101 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s --check-prefix=CHECK
+
define float @test_dup_sv2S(<2 x float> %v) {
- ;CHECK: test_dup_sv2S
- ;CHECK: dup {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+ ; CHECK-LABEL: test_dup_sv2S
+ ; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
%tmp1 = extractelement <2 x float> %v, i32 1
ret float %tmp1
}
define float @test_dup_sv2S_0(<2 x float> %v) {
- ;CHECK-LABEL: test_dup_sv2S_0
- ;CHECK-NOT: dup {{s[0-9]+}}, {{v[0-9]+}}.s[0]
- ;CHECK: ret
+ ; CHECK-LABEL: test_dup_sv2S_0
+ ; CHECK-NOT: dup {{[vsd][0-9]+}}
+ ; CHECK-NOT: ins {{[vsd][0-9]+}}
+ ; CHECK: ret
%tmp1 = extractelement <2 x float> %v, i32 0
ret float %tmp1
}
define float @test_dup_sv4S(<4 x float> %v) {
- ;CHECK-LABEL: test_dup_sv4S
- ;CHECK-NOT: dup {{s[0-9]+}}, {{v[0-9]+}}.s[0]
- ;CHECK: ret
+ ; CHECK-LABEL: test_dup_sv4S
+ ; CHECK-NOT: dup {{[vsd][0-9]+}}
+ ; CHECK-NOT: ins {{[vsd][0-9]+}}
+ ; CHECK: ret
%tmp1 = extractelement <4 x float> %v, i32 0
ret float %tmp1
}
define double @test_dup_dvD(<1 x double> %v) {
- ;CHECK: test_dup_dvD
- ;CHECK-NOT: dup {{d[0-9]+}}, {{v[0-9]+}}.d[0]
- ;CHECK: ret
+ ; CHECK-LABEL: test_dup_dvD
+ ; CHECK-NOT: dup {{[vsd][0-9]+}}
+ ; CHECK-NOT: ins {{[vsd][0-9]+}}
+ ; CHECK: ret
%tmp1 = extractelement <1 x double> %v, i32 0
ret double %tmp1
}
define double @test_dup_dv2D(<2 x double> %v) {
- ;CHECK: test_dup_dv2D
- ;CHECK: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+ ; CHECK-LABEL: test_dup_dv2D
+ ; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
%tmp1 = extractelement <2 x double> %v, i32 1
ret double %tmp1
}
define double @test_dup_dv2D_0(<2 x double> %v) {
- ;CHECK: test_dup_dv2D_0
- ;CHECK-NOT: dup {{d[0-9]+}}, {{v[0-9]+}}.d[0]
- ;CHECK: ret
+ ; CHECK-LABEL: test_dup_dv2D_0
+ ; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+ ; CHECK: ret
%tmp1 = extractelement <2 x double> %v, i32 1
ret double %tmp1
}
define <1 x i8> @test_vector_dup_bv16B(<16 x i8> %v1) {
- ;CHECK: test_vector_dup_bv16B
- ;CHECK: dup {{b[0-9]+}}, {{v[0-9]+}}.b[14]
+ ; CHECK-LABEL: test_vector_dup_bv16B
%shuffle.i = shufflevector <16 x i8> %v1, <16 x i8> undef, <1 x i32> <i32 14>
ret <1 x i8> %shuffle.i
}
define <1 x i8> @test_vector_dup_bv8B(<8 x i8> %v1) {
- ;CHECK: test_vector_dup_bv8B
- ;CHECK: dup {{b[0-9]+}}, {{v[0-9]+}}.b[7]
+ ; CHECK-LABEL: test_vector_dup_bv8B
%shuffle.i = shufflevector <8 x i8> %v1, <8 x i8> undef, <1 x i32> <i32 7>
ret <1 x i8> %shuffle.i
}
define <1 x i16> @test_vector_dup_hv8H(<8 x i16> %v1) {
- ;CHECK: test_vector_dup_hv8H
- ;CHECK: dup {{h[0-9]+}}, {{v[0-9]+}}.h[7]
+ ; CHECK-LABEL: test_vector_dup_hv8H
%shuffle.i = shufflevector <8 x i16> %v1, <8 x i16> undef, <1 x i32> <i32 7>
ret <1 x i16> %shuffle.i
}
define <1 x i16> @test_vector_dup_hv4H(<4 x i16> %v1) {
- ;CHECK: test_vector_dup_hv4H
- ;CHECK: dup {{h[0-9]+}}, {{v[0-9]+}}.h[3]
+ ; CHECK-LABEL: test_vector_dup_hv4H
%shuffle.i = shufflevector <4 x i16> %v1, <4 x i16> undef, <1 x i32> <i32 3>
ret <1 x i16> %shuffle.i
}
define <1 x i32> @test_vector_dup_sv4S(<4 x i32> %v1) {
- ;CHECK: test_vector_dup_sv4S
- ;CHECK: dup {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+ ; CHECK-LABEL: test_vector_dup_sv4S
%shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <1 x i32> <i32 3>
ret <1 x i32> %shuffle
}
define <1 x i32> @test_vector_dup_sv2S(<2 x i32> %v1) {
- ;CHECK: test_vector_dup_sv2S
- ;CHECK: dup {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+ ; CHECK-LABEL: test_vector_dup_sv2S
%shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <1 x i32> <i32 1>
ret <1 x i32> %shuffle
}
define <1 x i64> @test_vector_dup_dv2D(<2 x i64> %v1) {
- ;CHECK: test_vector_dup_dv2D
- ;CHECK: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+ ; CHECK-LABEL: test_vector_dup_dv2D
+ ; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #8
%shuffle.i = shufflevector <2 x i64> %v1, <2 x i64> undef, <1 x i32> <i32 1>
ret <1 x i64> %shuffle.i
}
define <1 x i64> @test_vector_copy_dup_dv2D(<1 x i64> %a, <2 x i64> %c) {
- ;CHECK: test_vector_copy_dup_dv2D
- ;CHECK: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+ ; CHECK-LABEL: test_vector_copy_dup_dv2D
+ ; CHECK: {{dup|mov}} {{d[0-9]+}}, {{v[0-9]+}}.d[1]
%vget_lane = extractelement <2 x i64> %c, i32 1
%vset_lane = insertelement <1 x i64> undef, i64 %vget_lane, i32 0
ret <1 x i64> %vset_lane
diff --git a/test/CodeGen/AArch64/neon-scalar-cvt.ll b/test/CodeGen/AArch64/neon-scalar-cvt.ll
deleted file mode 100644
index 3a19bed..0000000
--- a/test/CodeGen/AArch64/neon-scalar-cvt.ll
+++ /dev/null
@@ -1,133 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-define float @test_vcvts_f32_s32(i32 %a) {
-; CHECK: test_vcvts_f32_s32
-; CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %vcvtf.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %0 = call float @llvm.aarch64.neon.vcvtint2fps.f32.v1i32(<1 x i32> %vcvtf.i)
- ret float %0
-}
-
-declare float @llvm.aarch64.neon.vcvtint2fps.f32.v1i32(<1 x i32>)
-
-define double @test_vcvtd_f64_s64(i64 %a) {
-; CHECK: test_vcvtd_f64_s64
-; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %vcvtf.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %0 = call double @llvm.aarch64.neon.vcvtint2fps.f64.v1i64(<1 x i64> %vcvtf.i)
- ret double %0
-}
-
-declare double @llvm.aarch64.neon.vcvtint2fps.f64.v1i64(<1 x i64>)
-
-define float @test_vcvts_f32_u32(i32 %a) {
-; CHECK: test_vcvts_f32_u32
-; CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %vcvtf.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %0 = call float @llvm.aarch64.neon.vcvtint2fpu.f32.v1i32(<1 x i32> %vcvtf.i)
- ret float %0
-}
-
-declare float @llvm.aarch64.neon.vcvtint2fpu.f32.v1i32(<1 x i32>)
-
-define double @test_vcvtd_f64_u64(i64 %a) {
-; CHECK: test_vcvtd_f64_u64
-; CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %vcvtf.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %0 = call double @llvm.aarch64.neon.vcvtint2fpu.f64.v1i64(<1 x i64> %vcvtf.i)
- ret double %0
-}
-
-declare double @llvm.aarch64.neon.vcvtint2fpu.f64.v1i64(<1 x i64>)
-
-define float @test_vcvts_n_f32_s32(i32 %a) {
-; CHECK: test_vcvts_n_f32_s32
-; CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}}, #1
-entry:
- %vcvtf = insertelement <1 x i32> undef, i32 %a, i32 0
- %0 = call float @llvm.aarch64.neon.vcvtfxs2fp.n.f32.v1i32(<1 x i32> %vcvtf, i32 1)
- ret float %0
-}
-
-declare float @llvm.aarch64.neon.vcvtfxs2fp.n.f32.v1i32(<1 x i32>, i32)
-
-define double @test_vcvtd_n_f64_s64(i64 %a) {
-; CHECK: test_vcvtd_n_f64_s64
-; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}, #1
-entry:
- %vcvtf = insertelement <1 x i64> undef, i64 %a, i32 0
- %0 = call double @llvm.aarch64.neon.vcvtfxs2fp.n.f64.v1i64(<1 x i64> %vcvtf, i32 1)
- ret double %0
-}
-
-declare double @llvm.aarch64.neon.vcvtfxs2fp.n.f64.v1i64(<1 x i64>, i32)
-
-define float @test_vcvts_n_f32_u32(i32 %a) {
-; CHECK: test_vcvts_n_f32_u32
-; CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}}, #1
-entry:
- %vcvtf = insertelement <1 x i32> undef, i32 %a, i32 0
- %0 = call float @llvm.aarch64.neon.vcvtfxu2fp.n.f32.v1i32(<1 x i32> %vcvtf, i32 1)
- ret float %0
-}
-
-declare float @llvm.aarch64.neon.vcvtfxu2fp.n.f32.v1i32(<1 x i32>, i32)
-
-define double @test_vcvtd_n_f64_u64(i64 %a) {
-; CHECK: test_vcvtd_n_f64_u64
-; CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}}, #1
-entry:
- %vcvtf = insertelement <1 x i64> undef, i64 %a, i32 0
- %0 = call double @llvm.aarch64.neon.vcvtfxu2fp.n.f64.v1i64(<1 x i64> %vcvtf, i32 1)
- ret double %0
-}
-
-declare double @llvm.aarch64.neon.vcvtfxu2fp.n.f64.v1i64(<1 x i64>, i32)
-
-define i32 @test_vcvts_n_s32_f32(float %a) {
-; CHECK: test_vcvts_n_s32_f32
-; CHECK: fcvtzs {{s[0-9]+}}, {{s[0-9]+}}, #1
-entry:
- %fcvtzs1 = call <1 x i32> @llvm.aarch64.neon.vcvtfp2fxs.n.v1i32.f32(float %a, i32 1)
- %0 = extractelement <1 x i32> %fcvtzs1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vcvtfp2fxs.n.v1i32.f32(float, i32)
-
-define i64 @test_vcvtd_n_s64_f64(double %a) {
-; CHECK: test_vcvtd_n_s64_f64
-; CHECK: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}, #1
-entry:
- %fcvtzs1 = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.n.v1i64.f64(double %a, i32 1)
- %0 = extractelement <1 x i64> %fcvtzs1, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.n.v1i64.f64(double, i32)
-
-define i32 @test_vcvts_n_u32_f32(float %a) {
-; CHECK: test_vcvts_n_u32_f32
-; CHECK: fcvtzu {{s[0-9]+}}, {{s[0-9]+}}, #32
-entry:
- %fcvtzu1 = call <1 x i32> @llvm.aarch64.neon.vcvtfp2fxu.n.v1i32.f32(float %a, i32 32)
- %0 = extractelement <1 x i32> %fcvtzu1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vcvtfp2fxu.n.v1i32.f32(float, i32)
-
-define i64 @test_vcvtd_n_u64_f64(double %a) {
-; CHECK: test_vcvtd_n_u64_f64
-; CHECK: fcvtzu {{d[0-9]+}}, {{d[0-9]+}}, #64
-entry:
- %fcvtzu1 = tail call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.n.v1i64.f64(double %a, i32 64)
- %0 = extractelement <1 x i64> %fcvtzu1, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.n.v1i64.f64(double, i32)
diff --git a/test/CodeGen/AArch64/neon-scalar-ext.ll b/test/CodeGen/AArch64/neon-scalar-ext.ll
deleted file mode 100644
index 51dea06..0000000
--- a/test/CodeGen/AArch64/neon-scalar-ext.ll
+++ /dev/null
@@ -1,113 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-define <1 x i64> @test_zext_v1i32_v1i64(<2 x i32> %v) nounwind readnone {
-; CHECK-LABEL: test_zext_v1i32_v1i64:
-; CHECK: ushll v0.2d, v0.2s, #0
- %1 = extractelement <2 x i32> %v, i32 0
- %2 = insertelement <1 x i32> undef, i32 %1, i32 0
- %3 = zext <1 x i32> %2 to <1 x i64>
- ret <1 x i64> %3
-}
-
-define <1 x i32> @test_zext_v1i16_v1i32(<4 x i16> %v) nounwind readnone {
-; CHECK-LABEL: test_zext_v1i16_v1i32:
-; CHECK: ushll v0.4s, v0.4h, #0
- %1 = extractelement <4 x i16> %v, i32 0
- %2 = insertelement <1 x i16> undef, i16 %1, i32 0
- %3 = zext <1 x i16> %2 to <1 x i32>
- ret <1 x i32> %3
-}
-
-define <1 x i16> @test_zext_v1i8_v1i16(<8 x i8> %v) nounwind readnone {
-; CHECK-LABEL: test_zext_v1i8_v1i16:
-; CHECK: ushll v0.8h, v0.8b, #0
- %1 = extractelement <8 x i8> %v, i32 0
- %2 = insertelement <1 x i8> undef, i8 %1, i32 0
- %3 = zext <1 x i8> %2 to <1 x i16>
- ret <1 x i16> %3
-}
-
-define <1 x i32> @test_zext_v1i8_v1i32(<8 x i8> %v) nounwind readnone {
-; CHECK-LABEL: test_zext_v1i8_v1i32:
-; CHECK: dup b0, v0.b[0]
- %1 = extractelement <8 x i8> %v, i32 0
- %2 = insertelement <1 x i8> undef, i8 %1, i32 0
- %3 = zext <1 x i8> %2 to <1 x i32>
- ret <1 x i32> %3
-}
-
-define <1 x i64> @test_zext_v1i16_v1i64(<4 x i16> %v) nounwind readnone {
-; CHECK-LABEL: test_zext_v1i16_v1i64:
-; CHECK: dup h0, v0.h[0]
- %1 = extractelement <4 x i16> %v, i32 0
- %2 = insertelement <1 x i16> undef, i16 %1, i32 0
- %3 = zext <1 x i16> %2 to <1 x i64>
- ret <1 x i64> %3
-}
-
-define <1 x i64> @test_zext_v1i8_v1i64(<8 x i8> %v) nounwind readnone {
-; CHECK-LABEL: test_zext_v1i8_v1i64:
-; CHECK: dup b0, v0.b[0]
- %1 = extractelement <8 x i8> %v, i32 0
- %2 = insertelement <1 x i8> undef, i8 %1, i32 0
- %3 = zext <1 x i8> %2 to <1 x i64>
- ret <1 x i64> %3
-}
-
-define <1 x i64> @test_sext_v1i32_v1i64(<2 x i32> %v) nounwind readnone {
-; CHECK-LABEL: test_sext_v1i32_v1i64:
-; CHECK: sshll v0.2d, v0.2s, #0
- %1 = extractelement <2 x i32> %v, i32 0
- %2 = insertelement <1 x i32> undef, i32 %1, i32 0
- %3 = sext <1 x i32> %2 to <1 x i64>
- ret <1 x i64> %3
-}
-
-define <1 x i32> @test_sext_v1i16_v1i32(<4 x i16> %v) nounwind readnone {
-; CHECK-LABEL: test_sext_v1i16_v1i32:
-; CHECK: sshll v0.4s, v0.4h, #0
- %1 = extractelement <4 x i16> %v, i32 0
- %2 = insertelement <1 x i16> undef, i16 %1, i32 0
- %3 = sext <1 x i16> %2 to <1 x i32>
- ret <1 x i32> %3
-}
-
-define <1 x i16> @test_sext_v1i8_v1i16(<8 x i8> %v) nounwind readnone {
-; CHECK-LABEL: test_sext_v1i8_v1i16:
-; CHECK: sshll v0.8h, v0.8b, #0
- %1 = extractelement <8 x i8> %v, i32 0
- %2 = insertelement <1 x i8> undef, i8 %1, i32 0
- %3 = sext <1 x i8> %2 to <1 x i16>
- ret <1 x i16> %3
-}
-
-define <1 x i32> @test_sext_v1i8_v1i32(<8 x i8> %v) nounwind readnone {
-; CHECK-LABEL: test_sext_v1i8_v1i32:
-; CHECK: sshll v0.8h, v0.8b, #0
-; CHECK: sshll v0.4s, v0.4h, #0
- %1 = extractelement <8 x i8> %v, i32 0
- %2 = insertelement <1 x i8> undef, i8 %1, i32 0
- %3 = sext <1 x i8> %2 to <1 x i32>
- ret <1 x i32> %3
-}
-
-define <1 x i64> @test_sext_v1i16_v1i64(<4 x i16> %v) nounwind readnone {
-; CHECK-LABEL: test_sext_v1i16_v1i64:
-; CHECK: sshll v0.4s, v0.4h, #0
-; CHECK: sshll v0.2d, v0.2s, #0
- %1 = extractelement <4 x i16> %v, i32 0
- %2 = insertelement <1 x i16> undef, i16 %1, i32 0
- %3 = sext <1 x i16> %2 to <1 x i64>
- ret <1 x i64> %3
-}
-
-define <1 x i64> @test_sext_v1i8_v1i64(<8 x i8> %v) nounwind readnone {
-; CHECK-LABEL: test_sext_v1i8_v1i64:
-; CHECK: sshll v0.8h, v0.8b, #0
-; CHECK: sshll v0.4s, v0.4h, #0
-; CHECK: sshll v0.2d, v0.2s, #0
- %1 = extractelement <8 x i8> %v, i32 0
- %2 = insertelement <1 x i8> undef, i8 %1, i32 0
- %3 = sext <1 x i8> %2 to <1 x i64>
- ret <1 x i64> %3
-}
diff --git a/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll b/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll
deleted file mode 100644
index faf521b..0000000
--- a/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll
+++ /dev/null
@@ -1,104 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-define i8 @test_vqmovunh_s16(i16 %a) {
-; CHECK: test_vqmovunh_s16
-; CHECK: sqxtun {{b[0-9]+}}, {{h[0-9]+}}
-entry:
- %vqmovun.i = insertelement <1 x i16> undef, i16 %a, i32 0
- %vqmovun1.i = call <1 x i8> @llvm.arm.neon.vqmovnsu.v1i8(<1 x i16> %vqmovun.i)
- %0 = extractelement <1 x i8> %vqmovun1.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vqmovuns_s32(i32 %a) {
-; CHECK: test_vqmovuns_s32
-; CHECK: sqxtun {{h[0-9]+}}, {{s[0-9]+}}
-entry:
- %vqmovun.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %vqmovun1.i = call <1 x i16> @llvm.arm.neon.vqmovnsu.v1i16(<1 x i32> %vqmovun.i)
- %0 = extractelement <1 x i16> %vqmovun1.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vqmovund_s64(i64 %a) {
-; CHECK: test_vqmovund_s64
-; CHECK: sqxtun {{s[0-9]+}}, {{d[0-9]+}}
-entry:
- %vqmovun.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vqmovun1.i = call <1 x i32> @llvm.arm.neon.vqmovnsu.v1i32(<1 x i64> %vqmovun.i)
- %0 = extractelement <1 x i32> %vqmovun1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i8> @llvm.arm.neon.vqmovnsu.v1i8(<1 x i16>)
-declare <1 x i16> @llvm.arm.neon.vqmovnsu.v1i16(<1 x i32>)
-declare <1 x i32> @llvm.arm.neon.vqmovnsu.v1i32(<1 x i64>)
-
-define i8 @test_vqmovnh_s16(i16 %a) {
-; CHECK: test_vqmovnh_s16
-; CHECK: sqxtn {{b[0-9]+}}, {{h[0-9]+}}
-entry:
- %vqmovn.i = insertelement <1 x i16> undef, i16 %a, i32 0
- %vqmovn1.i = call <1 x i8> @llvm.arm.neon.vqmovns.v1i8(<1 x i16> %vqmovn.i)
- %0 = extractelement <1 x i8> %vqmovn1.i, i32 0
- ret i8 %0
-}
-
-define i16 @test_vqmovns_s32(i32 %a) {
-; CHECK: test_vqmovns_s32
-; CHECK: sqxtn {{h[0-9]+}}, {{s[0-9]+}}
-entry:
- %vqmovn.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %vqmovn1.i = call <1 x i16> @llvm.arm.neon.vqmovns.v1i16(<1 x i32> %vqmovn.i)
- %0 = extractelement <1 x i16> %vqmovn1.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vqmovnd_s64(i64 %a) {
-; CHECK: test_vqmovnd_s64
-; CHECK: sqxtn {{s[0-9]+}}, {{d[0-9]+}}
-entry:
- %vqmovn.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vqmovn1.i = call <1 x i32> @llvm.arm.neon.vqmovns.v1i32(<1 x i64> %vqmovn.i)
- %0 = extractelement <1 x i32> %vqmovn1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i8> @llvm.arm.neon.vqmovns.v1i8(<1 x i16>)
-declare <1 x i16> @llvm.arm.neon.vqmovns.v1i16(<1 x i32>)
-declare <1 x i32> @llvm.arm.neon.vqmovns.v1i32(<1 x i64>)
-
-define i8 @test_vqmovnh_u16(i16 %a) {
-; CHECK: test_vqmovnh_u16
-; CHECK: uqxtn {{b[0-9]+}}, {{h[0-9]+}}
-entry:
- %vqmovn.i = insertelement <1 x i16> undef, i16 %a, i32 0
- %vqmovn1.i = call <1 x i8> @llvm.arm.neon.vqmovnu.v1i8(<1 x i16> %vqmovn.i)
- %0 = extractelement <1 x i8> %vqmovn1.i, i32 0
- ret i8 %0
-}
-
-
-define i16 @test_vqmovns_u32(i32 %a) {
-; CHECK: test_vqmovns_u32
-; CHECK: uqxtn {{h[0-9]+}}, {{s[0-9]+}}
-entry:
- %vqmovn.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %vqmovn1.i = call <1 x i16> @llvm.arm.neon.vqmovnu.v1i16(<1 x i32> %vqmovn.i)
- %0 = extractelement <1 x i16> %vqmovn1.i, i32 0
- ret i16 %0
-}
-
-define i32 @test_vqmovnd_u64(i64 %a) {
-; CHECK: test_vqmovnd_u64
-; CHECK: uqxtn {{s[0-9]+}}, {{d[0-9]+}}
-entry:
- %vqmovn.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vqmovn1.i = call <1 x i32> @llvm.arm.neon.vqmovnu.v1i32(<1 x i64> %vqmovn.i)
- %0 = extractelement <1 x i32> %vqmovn1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i8> @llvm.arm.neon.vqmovnu.v1i8(<1 x i16>)
-declare <1 x i16> @llvm.arm.neon.vqmovnu.v1i16(<1 x i32>)
-declare <1 x i32> @llvm.arm.neon.vqmovnu.v1i32(<1 x i64>)
diff --git a/test/CodeGen/AArch64/neon-scalar-fabd.ll b/test/CodeGen/AArch64/neon-scalar-fabd.ll
deleted file mode 100644
index 6343310..0000000
--- a/test/CodeGen/AArch64/neon-scalar-fabd.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define float @test_vabds_f32(float %a, float %b) {
-; CHECK-LABEL: test_vabds_f32
-; CHECK: fabd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %0 = call float @llvm.aarch64.neon.vabd.f32(float %a, float %a)
- ret float %0
-}
-
-define double @test_vabdd_f64(double %a, double %b) {
-; CHECK-LABEL: test_vabdd_f64
-; CHECK: fabd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %0 = call double @llvm.aarch64.neon.vabd.f64(double %a, double %b)
- ret double %0
-}
-
-declare double @llvm.aarch64.neon.vabd.f64(double, double)
-declare float @llvm.aarch64.neon.vabd.f32(float, float)
diff --git a/test/CodeGen/AArch64/neon-scalar-fcvt.ll b/test/CodeGen/AArch64/neon-scalar-fcvt.ll
deleted file mode 100644
index 6cf30a7..0000000
--- a/test/CodeGen/AArch64/neon-scalar-fcvt.ll
+++ /dev/null
@@ -1,233 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-;; Scalar Floating-point Convert
-
-define float @test_vcvtxn(double %a) {
-; CHECK: test_vcvtxn
-; CHECK: fcvtxn {{s[0-9]}}, {{d[0-9]}}
-entry:
- %vcvtf = call float @llvm.aarch64.neon.fcvtxn(double %a)
- ret float %vcvtf
-}
-
-declare float @llvm.aarch64.neon.fcvtxn(double)
-
-define i32 @test_vcvtass(float %a) {
-; CHECK: test_vcvtass
-; CHECK: fcvtas {{s[0-9]}}, {{s[0-9]}}
-entry:
- %vcvtas1.i = call <1 x i32> @llvm.aarch64.neon.fcvtas.v1i32.f32(float %a)
- %0 = extractelement <1 x i32> %vcvtas1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fcvtas.v1i32.f32(float)
-
-define i64 @test_test_vcvtasd(double %a) {
-; CHECK: test_test_vcvtasd
-; CHECK: fcvtas {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcvtas1.i = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.f64(double %a)
- %0 = extractelement <1 x i64> %vcvtas1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.f64(double)
-
-define i32 @test_vcvtaus(float %a) {
-; CHECK: test_vcvtaus
-; CHECK: fcvtau {{s[0-9]}}, {{s[0-9]}}
-entry:
- %vcvtau1.i = call <1 x i32> @llvm.aarch64.neon.fcvtau.v1i32.f32(float %a)
- %0 = extractelement <1 x i32> %vcvtau1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fcvtau.v1i32.f32(float)
-
-define i64 @test_vcvtaud(double %a) {
-; CHECK: test_vcvtaud
-; CHECK: fcvtau {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcvtau1.i = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.f64(double %a)
- %0 = extractelement <1 x i64> %vcvtau1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.f64(double)
-
-define i32 @test_vcvtmss(float %a) {
-; CHECK: test_vcvtmss
-; CHECK: fcvtms {{s[0-9]}}, {{s[0-9]}}
-entry:
- %vcvtms1.i = call <1 x i32> @llvm.aarch64.neon.fcvtms.v1i32.f32(float %a)
- %0 = extractelement <1 x i32> %vcvtms1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fcvtms.v1i32.f32(float)
-
-define i64 @test_vcvtmd_s64_f64(double %a) {
-; CHECK: test_vcvtmd_s64_f64
-; CHECK: fcvtms {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcvtms1.i = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.f64(double %a)
- %0 = extractelement <1 x i64> %vcvtms1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.f64(double)
-
-define i32 @test_vcvtmus(float %a) {
-; CHECK: test_vcvtmus
-; CHECK: fcvtmu {{s[0-9]}}, {{s[0-9]}}
-entry:
- %vcvtmu1.i = call <1 x i32> @llvm.aarch64.neon.fcvtmu.v1i32.f32(float %a)
- %0 = extractelement <1 x i32> %vcvtmu1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fcvtmu.v1i32.f32(float)
-
-define i64 @test_vcvtmud(double %a) {
-; CHECK: test_vcvtmud
-; CHECK: fcvtmu {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcvtmu1.i = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.f64(double %a)
- %0 = extractelement <1 x i64> %vcvtmu1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.f64(double)
-
-define i32 @test_vcvtnss(float %a) {
-; CHECK: test_vcvtnss
-; CHECK: fcvtns {{s[0-9]}}, {{s[0-9]}}
-entry:
- %vcvtns1.i = call <1 x i32> @llvm.aarch64.neon.fcvtns.v1i32.f32(float %a)
- %0 = extractelement <1 x i32> %vcvtns1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fcvtns.v1i32.f32(float)
-
-define i64 @test_vcvtnd_s64_f64(double %a) {
-; CHECK: test_vcvtnd_s64_f64
-; CHECK: fcvtns {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcvtns1.i = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.f64(double %a)
- %0 = extractelement <1 x i64> %vcvtns1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.f64(double)
-
-define i32 @test_vcvtnus(float %a) {
-; CHECK: test_vcvtnus
-; CHECK: fcvtnu {{s[0-9]}}, {{s[0-9]}}
-entry:
- %vcvtnu1.i = call <1 x i32> @llvm.aarch64.neon.fcvtnu.v1i32.f32(float %a)
- %0 = extractelement <1 x i32> %vcvtnu1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fcvtnu.v1i32.f32(float)
-
-define i64 @test_vcvtnud(double %a) {
-; CHECK: test_vcvtnud
-; CHECK: fcvtnu {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcvtnu1.i = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.f64(double %a)
- %0 = extractelement <1 x i64> %vcvtnu1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.f64(double)
-
-define i32 @test_vcvtpss(float %a) {
-; CHECK: test_vcvtpss
-; CHECK: fcvtps {{s[0-9]}}, {{s[0-9]}}
-entry:
- %vcvtps1.i = call <1 x i32> @llvm.aarch64.neon.fcvtps.v1i32.f32(float %a)
- %0 = extractelement <1 x i32> %vcvtps1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fcvtps.v1i32.f32(float)
-
-define i64 @test_vcvtpd_s64_f64(double %a) {
-; CHECK: test_vcvtpd_s64_f64
-; CHECK: fcvtps {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcvtps1.i = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.f64(double %a)
- %0 = extractelement <1 x i64> %vcvtps1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.f64(double)
-
-define i32 @test_vcvtpus(float %a) {
-; CHECK: test_vcvtpus
-; CHECK: fcvtpu {{s[0-9]}}, {{s[0-9]}}
-entry:
- %vcvtpu1.i = call <1 x i32> @llvm.aarch64.neon.fcvtpu.v1i32.f32(float %a)
- %0 = extractelement <1 x i32> %vcvtpu1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fcvtpu.v1i32.f32(float)
-
-define i64 @test_vcvtpud(double %a) {
-; CHECK: test_vcvtpud
-; CHECK: fcvtpu {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcvtpu1.i = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.f64(double %a)
- %0 = extractelement <1 x i64> %vcvtpu1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.f64(double)
-
-define i32 @test_vcvtss(float %a) {
-; CHECK: test_vcvtss
-; CHECK: fcvtzs {{s[0-9]}}, {{s[0-9]}}
-entry:
- %vcvtzs1.i = call <1 x i32> @llvm.aarch64.neon.fcvtzs.v1i32.f32(float %a)
- %0 = extractelement <1 x i32> %vcvtzs1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fcvtzs.v1i32.f32(float)
-
-define i64 @test_vcvtd_s64_f64(double %a) {
-; CHECK: test_vcvtd_s64_f64
-; CHECK: fcvtzs {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcvzs1.i = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.f64(double %a)
- %0 = extractelement <1 x i64> %vcvzs1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.f64(double)
-
-define i32 @test_vcvtus(float %a) {
-; CHECK: test_vcvtus
-; CHECK: fcvtzu {{s[0-9]}}, {{s[0-9]}}
-entry:
- %vcvtzu1.i = call <1 x i32> @llvm.aarch64.neon.fcvtzu.v1i32.f32(float %a)
- %0 = extractelement <1 x i32> %vcvtzu1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fcvtzu.v1i32.f32(float)
-
-define i64 @test_vcvtud(double %a) {
-; CHECK: test_vcvtud
-; CHECK: fcvtzu {{d[0-9]}}, {{d[0-9]}}
-entry:
- %vcvtzu1.i = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.f64(double %a)
- %0 = extractelement <1 x i64> %vcvtzu1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.f64(double)
diff --git a/test/CodeGen/AArch64/neon-scalar-fp-compare.ll b/test/CodeGen/AArch64/neon-scalar-fp-compare.ll
deleted file mode 100644
index e0dce13..0000000
--- a/test/CodeGen/AArch64/neon-scalar-fp-compare.ll
+++ /dev/null
@@ -1,282 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-;; Scalar Floating-point Compare
-
-define i32 @test_vceqs_f32(float %a, float %b) {
-; CHECK-LABEL: test_vceqs_f32
-; CHECK: fcmeq {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
-entry:
- %fceq2.i = call <1 x i32> @llvm.aarch64.neon.fceq.v1i32.f32.f32(float %a, float %b)
- %0 = extractelement <1 x i32> %fceq2.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vceqd_f64(double %a, double %b) {
-; CHECK-LABEL: test_vceqd_f64
-; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %fceq2.i = call <1 x i64> @llvm.aarch64.neon.fceq.v1i64.f64.f64(double %a, double %b)
- %0 = extractelement <1 x i64> %fceq2.i, i32 0
- ret i64 %0
-}
-
-define <1 x i64> @test_vceqz_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vceqz_f64
-; CHECK: fcmeq {{d[0-9]+}}, {{d[0-9]+}}, #0.0
-entry:
- %0 = fcmp oeq <1 x double> %a, zeroinitializer
- %vceqz.i = sext <1 x i1> %0 to <1 x i64>
- ret <1 x i64> %vceqz.i
-}
-
-define i32 @test_vceqzs_f32(float %a) {
-; CHECK-LABEL: test_vceqzs_f32
-; CHECK: fcmeq {{s[0-9]}}, {{s[0-9]}}, #0.0
-entry:
- %fceq1.i = call <1 x i32> @llvm.aarch64.neon.fceq.v1i32.f32.f32(float %a, float 0.0)
- %0 = extractelement <1 x i32> %fceq1.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vceqzd_f64(double %a) {
-; CHECK-LABEL: test_vceqzd_f64
-; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, #0.0
-entry:
- %fceq1.i = call <1 x i64> @llvm.aarch64.neon.fceq.v1i64.f64.f32(double %a, float 0.0)
- %0 = extractelement <1 x i64> %fceq1.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vcges_f32(float %a, float %b) {
-; CHECK-LABEL: test_vcges_f32
-; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
-entry:
- %fcge2.i = call <1 x i32> @llvm.aarch64.neon.fcge.v1i32.f32.f32(float %a, float %b)
- %0 = extractelement <1 x i32> %fcge2.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcged_f64(double %a, double %b) {
-; CHECK-LABEL: test_vcged_f64
-; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %fcge2.i = call <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f64(double %a, double %b)
- %0 = extractelement <1 x i64> %fcge2.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vcgezs_f32(float %a) {
-; CHECK-LABEL: test_vcgezs_f32
-; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, #0.0
-entry:
- %fcge1.i = call <1 x i32> @llvm.aarch64.neon.fcge.v1i32.f32.f32(float %a, float 0.0)
- %0 = extractelement <1 x i32> %fcge1.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcgezd_f64(double %a) {
-; CHECK-LABEL: test_vcgezd_f64
-; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, #0.0
-entry:
- %fcge1.i = call <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f32(double %a, float 0.0)
- %0 = extractelement <1 x i64> %fcge1.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vcgts_f32(float %a, float %b) {
-; CHECK-LABEL: test_vcgts_f32
-; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
-entry:
- %fcgt2.i = call <1 x i32> @llvm.aarch64.neon.fcgt.v1i32.f32.f32(float %a, float %b)
- %0 = extractelement <1 x i32> %fcgt2.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcgtd_f64(double %a, double %b) {
-; CHECK-LABEL: test_vcgtd_f64
-; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %fcgt2.i = call <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f64(double %a, double %b)
- %0 = extractelement <1 x i64> %fcgt2.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vcgtzs_f32(float %a) {
-; CHECK-LABEL: test_vcgtzs_f32
-; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, #0.0
-entry:
- %fcgt1.i = call <1 x i32> @llvm.aarch64.neon.fcgt.v1i32.f32.f32(float %a, float 0.0)
- %0 = extractelement <1 x i32> %fcgt1.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcgtzd_f64(double %a) {
-; CHECK-LABEL: test_vcgtzd_f64
-; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, #0.0
-entry:
- %fcgt1.i = call <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f32(double %a, float 0.0)
- %0 = extractelement <1 x i64> %fcgt1.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vcles_f32(float %a, float %b) {
-; CHECK-LABEL: test_vcles_f32
-; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
-entry:
- %fcge2.i = call <1 x i32> @llvm.aarch64.neon.fcge.v1i32.f32.f32(float %a, float %b)
- %0 = extractelement <1 x i32> %fcge2.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcled_f64(double %a, double %b) {
-; CHECK-LABEL: test_vcled_f64
-; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %fcge2.i = call <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f64(double %a, double %b)
- %0 = extractelement <1 x i64> %fcge2.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vclezs_f32(float %a) {
-; CHECK-LABEL: test_vclezs_f32
-; CHECK: fcmle {{s[0-9]}}, {{s[0-9]}}, #0.0
-entry:
- %fcle1.i = call <1 x i32> @llvm.aarch64.neon.fclez.v1i32.f32.f32(float %a, float 0.0)
- %0 = extractelement <1 x i32> %fcle1.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vclezd_f64(double %a) {
-; CHECK-LABEL: test_vclezd_f64
-; CHECK: fcmle {{d[0-9]}}, {{d[0-9]}}, #0.0
-entry:
- %fcle1.i = call <1 x i64> @llvm.aarch64.neon.fclez.v1i64.f64.f32(double %a, float 0.0)
- %0 = extractelement <1 x i64> %fcle1.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vclts_f32(float %a, float %b) {
-; CHECK-LABEL: test_vclts_f32
-; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
-entry:
- %fcgt2.i = call <1 x i32> @llvm.aarch64.neon.fcgt.v1i32.f32.f32(float %a, float %b)
- %0 = extractelement <1 x i32> %fcgt2.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcltd_f64(double %a, double %b) {
-; CHECK-LABEL: test_vcltd_f64
-; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %fcgt2.i = call <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f64(double %a, double %b)
- %0 = extractelement <1 x i64> %fcgt2.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vcltzs_f32(float %a) {
-; CHECK-LABEL: test_vcltzs_f32
-; CHECK: fcmlt {{s[0-9]}}, {{s[0-9]}}, #0.0
-entry:
- %fclt1.i = call <1 x i32> @llvm.aarch64.neon.fcltz.v1i32.f32.f32(float %a, float 0.0)
- %0 = extractelement <1 x i32> %fclt1.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcltzd_f64(double %a) {
-; CHECK-LABEL: test_vcltzd_f64
-; CHECK: fcmlt {{d[0-9]}}, {{d[0-9]}}, #0.0
-entry:
- %fclt1.i = call <1 x i64> @llvm.aarch64.neon.fcltz.v1i64.f64.f32(double %a, float 0.0)
- %0 = extractelement <1 x i64> %fclt1.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vcages_f32(float %a, float %b) {
-; CHECK-LABEL: test_vcages_f32
-; CHECK: facge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
-entry:
- %fcage2.i = call <1 x i32> @llvm.aarch64.neon.fcage.v1i32.f32.f32(float %a, float %b)
- %0 = extractelement <1 x i32> %fcage2.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcaged_f64(double %a, double %b) {
-; CHECK-LABEL: test_vcaged_f64
-; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %fcage2.i = call <1 x i64> @llvm.aarch64.neon.fcage.v1i64.f64.f64(double %a, double %b)
- %0 = extractelement <1 x i64> %fcage2.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vcagts_f32(float %a, float %b) {
-; CHECK-LABEL: test_vcagts_f32
-; CHECK: facgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
-entry:
- %fcagt2.i = call <1 x i32> @llvm.aarch64.neon.fcagt.v1i32.f32.f32(float %a, float %b)
- %0 = extractelement <1 x i32> %fcagt2.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcagtd_f64(double %a, double %b) {
-; CHECK-LABEL: test_vcagtd_f64
-; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %fcagt2.i = call <1 x i64> @llvm.aarch64.neon.fcagt.v1i64.f64.f64(double %a, double %b)
- %0 = extractelement <1 x i64> %fcagt2.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vcales_f32(float %a, float %b) {
-; CHECK-LABEL: test_vcales_f32
-; CHECK: facge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
-entry:
- %fcage2.i = call <1 x i32> @llvm.aarch64.neon.fcage.v1i32.f32.f32(float %a, float %b)
- %0 = extractelement <1 x i32> %fcage2.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcaled_f64(double %a, double %b) {
-; CHECK-LABEL: test_vcaled_f64
-; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %fcage2.i = call <1 x i64> @llvm.aarch64.neon.fcage.v1i64.f64.f64(double %a, double %b)
- %0 = extractelement <1 x i64> %fcage2.i, i32 0
- ret i64 %0
-}
-
-define i32 @test_vcalts_f32(float %a, float %b) {
-; CHECK-LABEL: test_vcalts_f32
-; CHECK: facgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
-entry:
- %fcalt2.i = call <1 x i32> @llvm.aarch64.neon.fcagt.v1i32.f32.f32(float %a, float %b)
- %0 = extractelement <1 x i32> %fcalt2.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vcaltd_f64(double %a, double %b) {
-; CHECK-LABEL: test_vcaltd_f64
-; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
-entry:
- %fcalt2.i = call <1 x i64> @llvm.aarch64.neon.fcagt.v1i64.f64.f64(double %a, double %b)
- %0 = extractelement <1 x i64> %fcalt2.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.fceq.v1i32.f32.f32(float, float)
-declare <1 x i64> @llvm.aarch64.neon.fceq.v1i64.f64.f32(double, float)
-declare <1 x i64> @llvm.aarch64.neon.fceq.v1i64.f64.f64(double, double)
-declare <1 x i32> @llvm.aarch64.neon.fcge.v1i32.f32.f32(float, float)
-declare <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f32(double, float)
-declare <1 x i64> @llvm.aarch64.neon.fcge.v1i64.f64.f64(double, double)
-declare <1 x i32> @llvm.aarch64.neon.fclez.v1i32.f32.f32(float, float)
-declare <1 x i64> @llvm.aarch64.neon.fclez.v1i64.f64.f32(double, float)
-declare <1 x i32> @llvm.aarch64.neon.fcgt.v1i32.f32.f32(float, float)
-declare <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f32(double, float)
-declare <1 x i64> @llvm.aarch64.neon.fcgt.v1i64.f64.f64(double, double)
-declare <1 x i32> @llvm.aarch64.neon.fcltz.v1i32.f32.f32(float, float)
-declare <1 x i64> @llvm.aarch64.neon.fcltz.v1i64.f64.f32(double, float)
-declare <1 x i32> @llvm.aarch64.neon.fcage.v1i32.f32.f32(float, float)
-declare <1 x i64> @llvm.aarch64.neon.fcage.v1i64.f64.f64(double, double)
-declare <1 x i32> @llvm.aarch64.neon.fcagt.v1i32.f32.f32(float, float)
-declare <1 x i64> @llvm.aarch64.neon.fcagt.v1i64.f64.f64(double, double)
diff --git a/test/CodeGen/AArch64/neon-scalar-mul.ll b/test/CodeGen/AArch64/neon-scalar-mul.ll
deleted file mode 100644
index 991037f..0000000
--- a/test/CodeGen/AArch64/neon-scalar-mul.ll
+++ /dev/null
@@ -1,143 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-define i16 @test_vqdmulhh_s16(i16 %a, i16 %b) {
-; CHECK: test_vqdmulhh_s16
-; CHECK: sqdmulh {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
- %1 = insertelement <1 x i16> undef, i16 %a, i32 0
- %2 = insertelement <1 x i16> undef, i16 %b, i32 0
- %3 = call <1 x i16> @llvm.arm.neon.vqdmulh.v1i16(<1 x i16> %1, <1 x i16> %2)
- %4 = extractelement <1 x i16> %3, i32 0
- ret i16 %4
-}
-
-define i32 @test_vqdmulhs_s32(i32 %a, i32 %b) {
-; CHECK: test_vqdmulhs_s32
-; CHECK: sqdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- %1 = insertelement <1 x i32> undef, i32 %a, i32 0
- %2 = insertelement <1 x i32> undef, i32 %b, i32 0
- %3 = call <1 x i32> @llvm.arm.neon.vqdmulh.v1i32(<1 x i32> %1, <1 x i32> %2)
- %4 = extractelement <1 x i32> %3, i32 0
- ret i32 %4
-}
-
-declare <1 x i16> @llvm.arm.neon.vqdmulh.v1i16(<1 x i16>, <1 x i16>)
-declare <1 x i32> @llvm.arm.neon.vqdmulh.v1i32(<1 x i32>, <1 x i32>)
-
-define i16 @test_vqrdmulhh_s16(i16 %a, i16 %b) {
-; CHECK: test_vqrdmulhh_s16
-; CHECK: sqrdmulh {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
- %1 = insertelement <1 x i16> undef, i16 %a, i32 0
- %2 = insertelement <1 x i16> undef, i16 %b, i32 0
- %3 = call <1 x i16> @llvm.arm.neon.vqrdmulh.v1i16(<1 x i16> %1, <1 x i16> %2)
- %4 = extractelement <1 x i16> %3, i32 0
- ret i16 %4
-}
-
-define i32 @test_vqrdmulhs_s32(i32 %a, i32 %b) {
-; CHECK: test_vqrdmulhs_s32
-; CHECK: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- %1 = insertelement <1 x i32> undef, i32 %a, i32 0
- %2 = insertelement <1 x i32> undef, i32 %b, i32 0
- %3 = call <1 x i32> @llvm.arm.neon.vqrdmulh.v1i32(<1 x i32> %1, <1 x i32> %2)
- %4 = extractelement <1 x i32> %3, i32 0
- ret i32 %4
-}
-
-declare <1 x i16> @llvm.arm.neon.vqrdmulh.v1i16(<1 x i16>, <1 x i16>)
-declare <1 x i32> @llvm.arm.neon.vqrdmulh.v1i32(<1 x i32>, <1 x i32>)
-
-define float @test_vmulxs_f32(float %a, float %b) {
-; CHECK: test_vmulxs_f32
-; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- %1 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %b)
- ret float %1
-}
-
-define double @test_vmulxd_f64(double %a, double %b) {
-; CHECK: test_vmulxd_f64
-; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- %1 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %b)
- ret double %1
-}
-
-declare float @llvm.aarch64.neon.vmulx.f32(float, float)
-declare double @llvm.aarch64.neon.vmulx.f64(double, double)
-
-define i32 @test_vqdmlalh_s16(i32 %a, i16 %b, i16 %c) {
-; CHECK: test_vqdmlalh_s16
-; CHECK: sqdmlal {{s[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
-entry:
- %vqdmlal.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %vqdmlal1.i = insertelement <1 x i16> undef, i16 %b, i32 0
- %vqdmlal2.i = insertelement <1 x i16> undef, i16 %c, i32 0
- %vqdmlal3.i = call <1 x i32> @llvm.aarch64.neon.vqdmlal.v1i32(<1 x i32> %vqdmlal.i, <1 x i16> %vqdmlal1.i, <1 x i16> %vqdmlal2.i)
- %0 = extractelement <1 x i32> %vqdmlal3.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vqdmlals_s32(i64 %a, i32 %b, i32 %c) {
-; CHECK: test_vqdmlals_s32
-; CHECK: sqdmlal {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %vqdmlal.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vqdmlal1.i = insertelement <1 x i32> undef, i32 %b, i32 0
- %vqdmlal2.i = insertelement <1 x i32> undef, i32 %c, i32 0
- %vqdmlal3.i = call <1 x i64> @llvm.aarch64.neon.vqdmlal.v1i64(<1 x i64> %vqdmlal.i, <1 x i32> %vqdmlal1.i, <1 x i32> %vqdmlal2.i)
- %0 = extractelement <1 x i64> %vqdmlal3.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vqdmlal.v1i32(<1 x i32>, <1 x i16>, <1 x i16>)
-declare <1 x i64> @llvm.aarch64.neon.vqdmlal.v1i64(<1 x i64>, <1 x i32>, <1 x i32>)
-
-define i32 @test_vqdmlslh_s16(i32 %a, i16 %b, i16 %c) {
-; CHECK: test_vqdmlslh_s16
-; CHECK: sqdmlsl {{s[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
-entry:
- %vqdmlsl.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %vqdmlsl1.i = insertelement <1 x i16> undef, i16 %b, i32 0
- %vqdmlsl2.i = insertelement <1 x i16> undef, i16 %c, i32 0
- %vqdmlsl3.i = call <1 x i32> @llvm.aarch64.neon.vqdmlsl.v1i32(<1 x i32> %vqdmlsl.i, <1 x i16> %vqdmlsl1.i, <1 x i16> %vqdmlsl2.i)
- %0 = extractelement <1 x i32> %vqdmlsl3.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vqdmlsls_s32(i64 %a, i32 %b, i32 %c) {
-; CHECK: test_vqdmlsls_s32
-; CHECK: sqdmlsl {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %vqdmlsl.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vqdmlsl1.i = insertelement <1 x i32> undef, i32 %b, i32 0
- %vqdmlsl2.i = insertelement <1 x i32> undef, i32 %c, i32 0
- %vqdmlsl3.i = call <1 x i64> @llvm.aarch64.neon.vqdmlsl.v1i64(<1 x i64> %vqdmlsl.i, <1 x i32> %vqdmlsl1.i, <1 x i32> %vqdmlsl2.i)
- %0 = extractelement <1 x i64> %vqdmlsl3.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vqdmlsl.v1i32(<1 x i32>, <1 x i16>, <1 x i16>)
-declare <1 x i64> @llvm.aarch64.neon.vqdmlsl.v1i64(<1 x i64>, <1 x i32>, <1 x i32>)
-
-define i32 @test_vqdmullh_s16(i16 %a, i16 %b) {
-; CHECK: test_vqdmullh_s16
-; CHECK: sqdmull {{s[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
-entry:
- %vqdmull.i = insertelement <1 x i16> undef, i16 %a, i32 0
- %vqdmull1.i = insertelement <1 x i16> undef, i16 %b, i32 0
- %vqdmull2.i = call <1 x i32> @llvm.arm.neon.vqdmull.v1i32(<1 x i16> %vqdmull.i, <1 x i16> %vqdmull1.i)
- %0 = extractelement <1 x i32> %vqdmull2.i, i32 0
- ret i32 %0
-}
-
-define i64 @test_vqdmulls_s32(i32 %a, i32 %b) {
-; CHECK: test_vqdmulls_s32
-; CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %vqdmull.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %vqdmull1.i = insertelement <1 x i32> undef, i32 %b, i32 0
- %vqdmull2.i = call <1 x i64> @llvm.arm.neon.vqdmull.v1i64(<1 x i32> %vqdmull.i, <1 x i32> %vqdmull1.i)
- %0 = extractelement <1 x i64> %vqdmull2.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i32> @llvm.arm.neon.vqdmull.v1i32(<1 x i16>, <1 x i16>)
-declare <1 x i64> @llvm.arm.neon.vqdmull.v1i64(<1 x i32>, <1 x i32>)
diff --git a/test/CodeGen/AArch64/neon-scalar-neg.ll b/test/CodeGen/AArch64/neon-scalar-neg.ll
deleted file mode 100644
index 4dc9d51..0000000
--- a/test/CodeGen/AArch64/neon-scalar-neg.ll
+++ /dev/null
@@ -1,61 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define i64 @test_vnegd_s64(i64 %a) {
-; CHECK: test_vnegd_s64
-; CHECK: neg {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %vneg.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vneg1.i = tail call <1 x i64> @llvm.aarch64.neon.vneg(<1 x i64> %vneg.i)
- %0 = extractelement <1 x i64> %vneg1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vneg(<1 x i64>)
-
-define i8 @test_vqnegb_s8(i8 %a) {
-; CHECK: test_vqnegb_s8
-; CHECK: sqneg {{b[0-9]+}}, {{b[0-9]+}}
-entry:
- %vqneg.i = insertelement <1 x i8> undef, i8 %a, i32 0
- %vqneg1.i = call <1 x i8> @llvm.arm.neon.vqneg.v1i8(<1 x i8> %vqneg.i)
- %0 = extractelement <1 x i8> %vqneg1.i, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.arm.neon.vqneg.v1i8(<1 x i8>)
-
-define i16 @test_vqnegh_s16(i16 %a) {
-; CHECK: test_vqnegh_s16
-; CHECK: sqneg {{h[0-9]+}}, {{h[0-9]+}}
-entry:
- %vqneg.i = insertelement <1 x i16> undef, i16 %a, i32 0
- %vqneg1.i = call <1 x i16> @llvm.arm.neon.vqneg.v1i16(<1 x i16> %vqneg.i)
- %0 = extractelement <1 x i16> %vqneg1.i, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.arm.neon.vqneg.v1i16(<1 x i16>)
-
-define i32 @test_vqnegs_s32(i32 %a) {
-; CHECK: test_vqnegs_s32
-; CHECK: sqneg {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %vqneg.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %vqneg1.i = call <1 x i32> @llvm.arm.neon.vqneg.v1i32(<1 x i32> %vqneg.i)
- %0 = extractelement <1 x i32> %vqneg1.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.arm.neon.vqneg.v1i32(<1 x i32>)
-
-define i64 @test_vqnegd_s64(i64 %a) {
-; CHECK: test_vqnegd_s64
-; CHECK: sqneg {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %vqneg.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vqneg1.i = call <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64> %vqneg.i)
- %0 = extractelement <1 x i64> %vqneg1.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64>) \ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-scalar-recip.ll b/test/CodeGen/AArch64/neon-scalar-recip.ll
deleted file mode 100644
index 100839b..0000000
--- a/test/CodeGen/AArch64/neon-scalar-recip.ll
+++ /dev/null
@@ -1,92 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-define float @test_vrecpss_f32(float %a, float %b) {
-; CHECK: test_vrecpss_f32
-; CHECK: frecps {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- %1 = call float @llvm.aarch64.neon.vrecps.f32(float %a, float %b)
- ret float %1
-}
-
-define double @test_vrecpsd_f64(double %a, double %b) {
-; CHECK: test_vrecpsd_f64
-; CHECK: frecps {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- %1 = call double @llvm.aarch64.neon.vrecps.f64(double %a, double %b)
- ret double %1
-}
-
-declare float @llvm.aarch64.neon.vrecps.f32(float, float)
-declare double @llvm.aarch64.neon.vrecps.f64(double, double)
-
-define float @test_vrsqrtss_f32(float %a, float %b) {
-; CHECK: test_vrsqrtss_f32
-; CHECK: frsqrts {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- %1 = call float @llvm.aarch64.neon.vrsqrts.f32(float %a, float %b)
- ret float %1
-}
-
-define double @test_vrsqrtsd_f64(double %a, double %b) {
-; CHECK: test_vrsqrtsd_f64
-; CHECK: frsqrts {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- %1 = call double @llvm.aarch64.neon.vrsqrts.f64(double %a, double %b)
- ret double %1
-}
-
-declare float @llvm.aarch64.neon.vrsqrts.f32(float, float)
-declare double @llvm.aarch64.neon.vrsqrts.f64(double, double)
-
-define float @test_vrecpes_f32(float %a) {
-; CHECK: test_vrecpes_f32
-; CHECK: frecpe {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %0 = call float @llvm.aarch64.neon.vrecpe.f32(float %a)
- ret float %0
-}
-
-define double @test_vrecped_f64(double %a) {
-; CHECK: test_vrecped_f64
-; CHECK: frecpe {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %0 = call double @llvm.aarch64.neon.vrecpe.f64(double %a)
- ret double %0
-}
-
-declare float @llvm.aarch64.neon.vrecpe.f32(float)
-declare double @llvm.aarch64.neon.vrecpe.f64(double)
-
-define float @test_vrecpxs_f32(float %a) {
-; CHECK: test_vrecpxs_f32
-; CHECK: frecpx {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %0 = call float @llvm.aarch64.neon.vrecpx.f32(float %a)
- ret float %0
-}
-
-define double @test_vrecpxd_f64(double %a) {
-; CHECK: test_vrecpxd_f64
-; CHECK: frecpx {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %0 = call double @llvm.aarch64.neon.vrecpx.f64(double %a)
- ret double %0
-}
-
-declare float @llvm.aarch64.neon.vrecpx.f32(float)
-declare double @llvm.aarch64.neon.vrecpx.f64(double)
-
-define float @test_vrsqrtes_f32(float %a) {
-; CHECK: test_vrsqrtes_f32
-; CHECK: frsqrte {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %0 = call float @llvm.aarch64.neon.vrsqrte.f32(float %a)
- ret float %0
-}
-
-define double @test_vrsqrted_f64(double %a) {
-; CHECK: test_vrsqrted_f64
-; CHECK: frsqrte {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %0 = call double @llvm.aarch64.neon.vrsqrte.f64(double %a)
- ret double %0
-}
-
-declare float @llvm.aarch64.neon.vrsqrte.f32(float)
-declare double @llvm.aarch64.neon.vrsqrte.f64(double)
diff --git a/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll b/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll
deleted file mode 100644
index 33ce5cf..0000000
--- a/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll
+++ /dev/null
@@ -1,215 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <1 x i64> @llvm.aarch64.neon.vpadd(<2 x i64>)
-
-define <1 x i64> @test_addp_v1i64(<2 x i64> %a) {
-; CHECK: test_addp_v1i64:
-; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %val = call <1 x i64> @llvm.aarch64.neon.vpadd(<2 x i64> %a)
- ret <1 x i64> %val
-}
-
-declare float @llvm.aarch64.neon.vpfadd.f32.v2f32(<2 x float>)
-
-define float @test_faddp_f32(<2 x float> %a) {
-; CHECK: test_faddp_f32:
-; CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %val = call float @llvm.aarch64.neon.vpfadd.f32.v2f32(<2 x float> %a)
- ret float %val
-}
-
-declare double @llvm.aarch64.neon.vpfadd.f64.v2f64(<2 x double>)
-
-define double @test_faddp_f64(<2 x double> %a) {
-; CHECK: test_faddp_f64:
-; CHECK: faddp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %val = call double @llvm.aarch64.neon.vpfadd.f64.v2f64(<2 x double> %a)
- ret double %val
-}
-
-
-declare float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float>)
-
-define float @test_fmaxp_f32(<2 x float> %a) {
-; CHECK: test_fmaxp_f32:
-; CHECK: fmaxp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %val = call float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float> %a)
- ret float %val
-}
-
-declare double @llvm.aarch64.neon.vpmax.f64.v2f64(<2 x double>)
-
-define double @test_fmaxp_f64(<2 x double> %a) {
-; CHECK: test_fmaxp_f64:
-; CHECK: fmaxp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %val = call double @llvm.aarch64.neon.vpmax.f64.v2f64(<2 x double> %a)
- ret double %val
-}
-
-declare float @llvm.aarch64.neon.vpmin.f32.v2f32(<2 x float>)
-
-define float @test_fminp_f32(<2 x float> %a) {
-; CHECK: test_fminp_f32:
-; CHECK: fminp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %val = call float @llvm.aarch64.neon.vpmin.f32.v2f32(<2 x float> %a)
- ret float %val
-}
-
-declare double @llvm.aarch64.neon.vpmin.f64.v2f64(<2 x double>)
-
-define double @test_fminp_f64(<2 x double> %a) {
-; CHECK: test_fminp_f64:
-; CHECK: fminp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %val = call double @llvm.aarch64.neon.vpmin.f64.v2f64(<2 x double> %a)
- ret double %val
-}
-
-declare float @llvm.aarch64.neon.vpfmaxnm.f32.v2f32(<2 x float>)
-
-define float @test_fmaxnmp_f32(<2 x float> %a) {
-; CHECK: test_fmaxnmp_f32:
-; CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %val = call float @llvm.aarch64.neon.vpfmaxnm.f32.v2f32(<2 x float> %a)
- ret float %val
-}
-
-declare double @llvm.aarch64.neon.vpfmaxnm.f64.v2f64(<2 x double>)
-
-define double @test_fmaxnmp_f64(<2 x double> %a) {
-; CHECK: test_fmaxnmp_f64:
-; CHECK: fmaxnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %val = call double @llvm.aarch64.neon.vpfmaxnm.f64.v2f64(<2 x double> %a)
- ret double %val
-}
-
-declare float @llvm.aarch64.neon.vpfminnm.f32.v2f32(<2 x float>)
-
-define float @test_fminnmp_f32(<2 x float> %a) {
-; CHECK: test_fminnmp_f32:
-; CHECK: fminnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %val = call float @llvm.aarch64.neon.vpfminnm.f32.v2f32(<2 x float> %a)
- ret float %val
-}
-
-declare double @llvm.aarch64.neon.vpfminnm.f64.v2f64(<2 x double>)
-
-define double @test_fminnmp_f64(<2 x double> %a) {
-; CHECK: test_fminnmp_f64:
-; CHECK: fminnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %val = call double @llvm.aarch64.neon.vpfminnm.f64.v2f64(<2 x double> %a)
- ret double %val
-}
-
-define float @test_vaddv_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vaddv_f32
-; CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %1 = call float @llvm.aarch64.neon.vpfadd.f32.v2f32(<2 x float> %a)
- ret float %1
-}
-
-define float @test_vaddvq_f32(<4 x float> %a) {
-; CHECK-LABEL: test_vaddvq_f32
-; CHECK: faddp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-; CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %1 = call float @llvm.aarch64.neon.vpfadd.f32.v4f32(<4 x float> %a)
- ret float %1
-}
-
-define double @test_vaddvq_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vaddvq_f64
-; CHECK: faddp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %1 = call double @llvm.aarch64.neon.vpfadd.f64.v2f64(<2 x double> %a)
- ret double %1
-}
-
-define float @test_vmaxv_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vmaxv_f32
-; CHECK: fmaxp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %1 = call float @llvm.aarch64.neon.vpmax.f32.v2f32(<2 x float> %a)
- ret float %1
-}
-
-define double @test_vmaxvq_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vmaxvq_f64
-; CHECK: fmaxp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %1 = call double @llvm.aarch64.neon.vpmax.f64.v2f64(<2 x double> %a)
- ret double %1
-}
-
-define float @test_vminv_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vminv_f32
-; CHECK: fminp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %1 = call float @llvm.aarch64.neon.vpmin.f32.v2f32(<2 x float> %a)
- ret float %1
-}
-
-define double @test_vminvq_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vminvq_f64
-; CHECK: fminp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %1 = call double @llvm.aarch64.neon.vpmin.f64.v2f64(<2 x double> %a)
- ret double %1
-}
-
-define double @test_vmaxnmvq_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vmaxnmvq_f64
-; CHECK: fmaxnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %1 = call double @llvm.aarch64.neon.vpfmaxnm.f64.v2f64(<2 x double> %a)
- ret double %1
-}
-
-define float @test_vmaxnmv_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vmaxnmv_f32
-; CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %1 = call float @llvm.aarch64.neon.vpfmaxnm.f32.v2f32(<2 x float> %a)
- ret float %1
-}
-
-define double @test_vminnmvq_f64(<2 x double> %a) {
-; CHECK-LABEL: test_vminnmvq_f64
-; CHECK: fminnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %1 = call double @llvm.aarch64.neon.vpfminnm.f64.v2f64(<2 x double> %a)
- ret double %1
-}
-
-define float @test_vminnmv_f32(<2 x float> %a) {
-; CHECK-LABEL: test_vminnmv_f32
-; CHECK: fminnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
- %1 = call float @llvm.aarch64.neon.vpfminnm.f32.v2f32(<2 x float> %a)
- ret float %1
-}
-
-define <2 x i64> @test_vpaddq_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test_vpaddq_s64
-; CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
- %1 = call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %a, <2 x i64> %b)
- ret <2 x i64> %1
-}
-
-define <2 x i64> @test_vpaddq_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test_vpaddq_u64
-; CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
- %1 = call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %a, <2 x i64> %b)
- ret <2 x i64> %1
-}
-
-define i64 @test_vaddvq_s64(<2 x i64> %a) {
-; CHECK-LABEL: test_vaddvq_s64
-; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %1 = call <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64> %a)
- %2 = extractelement <1 x i64> %1, i32 0
- ret i64 %2
-}
-
-define i64 @test_vaddvq_u64(<2 x i64> %a) {
-; CHECK-LABEL: test_vaddvq_u64
-; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
- %1 = call <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64> %a)
- %2 = extractelement <1 x i64> %1, i32 0
- ret i64 %2
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64>)
-
-declare <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64>, <2 x i64>)
-
-declare float @llvm.aarch64.neon.vpfadd.f32.v4f32(<4 x float>)
diff --git a/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll b/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll
deleted file mode 100644
index 7c9ffa0..0000000
--- a/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-
-declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_urshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_urshl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_srshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_srshl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vrshldu(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vrshlds(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_urshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_urshl_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vrshldu(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_srshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_srshl_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vrshlds(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-
-
diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll b/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll
deleted file mode 100644
index 5c010ef..0000000
--- a/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll
+++ /dev/null
@@ -1,242 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <1 x i8> @llvm.arm.neon.vqaddu.v1i8(<1 x i8>, <1 x i8>)
-declare <1 x i8> @llvm.arm.neon.vqadds.v1i8(<1 x i8>, <1 x i8>)
-
-define <1 x i8> @test_uqadd_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
-; CHECK: test_uqadd_v1i8_aarch64:
- %tmp1 = call <1 x i8> @llvm.arm.neon.vqaddu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: uqadd {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
- ret <1 x i8> %tmp1
-}
-
-define <1 x i8> @test_sqadd_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
-; CHECK: test_sqadd_v1i8_aarch64:
- %tmp1 = call <1 x i8> @llvm.arm.neon.vqadds.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: sqadd {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
- ret <1 x i8> %tmp1
-}
-
-declare <1 x i8> @llvm.arm.neon.vqsubu.v1i8(<1 x i8>, <1 x i8>)
-declare <1 x i8> @llvm.arm.neon.vqsubs.v1i8(<1 x i8>, <1 x i8>)
-
-define <1 x i8> @test_uqsub_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
-; CHECK: test_uqsub_v1i8_aarch64:
- %tmp1 = call <1 x i8> @llvm.arm.neon.vqsubu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: uqsub {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
- ret <1 x i8> %tmp1
-}
-
-define <1 x i8> @test_sqsub_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
-; CHECK: test_sqsub_v1i8_aarch64:
- %tmp1 = call <1 x i8> @llvm.arm.neon.vqsubs.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: sqsub {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
- ret <1 x i8> %tmp1
-}
-
-declare <1 x i16> @llvm.arm.neon.vqaddu.v1i16(<1 x i16>, <1 x i16>)
-declare <1 x i16> @llvm.arm.neon.vqadds.v1i16(<1 x i16>, <1 x i16>)
-
-define <1 x i16> @test_uqadd_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
-; CHECK: test_uqadd_v1i16_aarch64:
- %tmp1 = call <1 x i16> @llvm.arm.neon.vqaddu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: uqadd {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
- ret <1 x i16> %tmp1
-}
-
-define <1 x i16> @test_sqadd_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
-; CHECK: test_sqadd_v1i16_aarch64:
- %tmp1 = call <1 x i16> @llvm.arm.neon.vqadds.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: sqadd {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
- ret <1 x i16> %tmp1
-}
-
-declare <1 x i16> @llvm.arm.neon.vqsubu.v1i16(<1 x i16>, <1 x i16>)
-declare <1 x i16> @llvm.arm.neon.vqsubs.v1i16(<1 x i16>, <1 x i16>)
-
-define <1 x i16> @test_uqsub_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
-; CHECK: test_uqsub_v1i16_aarch64:
- %tmp1 = call <1 x i16> @llvm.arm.neon.vqsubu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: uqsub {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
- ret <1 x i16> %tmp1
-}
-
-define <1 x i16> @test_sqsub_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
-; CHECK: test_sqsub_v1i16_aarch64:
- %tmp1 = call <1 x i16> @llvm.arm.neon.vqsubs.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: sqsub {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
- ret <1 x i16> %tmp1
-}
-
-declare <1 x i32> @llvm.arm.neon.vqaddu.v1i32(<1 x i32>, <1 x i32>)
-declare <1 x i32> @llvm.arm.neon.vqadds.v1i32(<1 x i32>, <1 x i32>)
-
-define <1 x i32> @test_uqadd_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
-; CHECK: test_uqadd_v1i32_aarch64:
- %tmp1 = call <1 x i32> @llvm.arm.neon.vqaddu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: uqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- ret <1 x i32> %tmp1
-}
-
-define <1 x i32> @test_sqadd_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
-; CHECK: test_sqadd_v1i32_aarch64:
- %tmp1 = call <1 x i32> @llvm.arm.neon.vqadds.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: sqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- ret <1 x i32> %tmp1
-}
-
-declare <1 x i32> @llvm.arm.neon.vqsubu.v1i32(<1 x i32>, <1 x i32>)
-declare <1 x i32> @llvm.arm.neon.vqsubs.v1i32(<1 x i32>, <1 x i32>)
-
-define <1 x i32> @test_uqsub_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
-; CHECK: test_uqsub_v1i32_aarch64:
- %tmp1 = call <1 x i32> @llvm.arm.neon.vqsubu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: uqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- ret <1 x i32> %tmp1
-}
-
-
-define <1 x i32> @test_sqsub_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
-; CHECK: test_sqsub_v1i32_aarch64:
- %tmp1 = call <1 x i32> @llvm.arm.neon.vqsubs.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: sqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- ret <1 x i32> %tmp1
-}
-
-declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_uqadd_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uqadd_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: uqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sqadd_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sqadd_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: sqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_uqsub_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uqsub_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: uqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sqsub_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sqsub_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: sqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define i8 @test_vuqaddb_s8(i8 %a, i8 %b) {
-; CHECK: test_vuqaddb_s8
-; CHECK: suqadd {{b[0-9]+}}, {{b[0-9]+}}
-entry:
- %vuqadd.i = insertelement <1 x i8> undef, i8 %a, i32 0
- %vuqadd1.i = insertelement <1 x i8> undef, i8 %b, i32 0
- %vuqadd2.i = call <1 x i8> @llvm.aarch64.neon.vuqadd.v1i8(<1 x i8> %vuqadd.i, <1 x i8> %vuqadd1.i)
- %0 = extractelement <1 x i8> %vuqadd2.i, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8>, <1 x i8>)
-
-define i16 @test_vuqaddh_s16(i16 %a, i16 %b) {
-; CHECK: test_vuqaddh_s16
-; CHECK: suqadd {{h[0-9]+}}, {{h[0-9]+}}
-entry:
- %vuqadd.i = insertelement <1 x i16> undef, i16 %a, i32 0
- %vuqadd1.i = insertelement <1 x i16> undef, i16 %b, i32 0
- %vuqadd2.i = call <1 x i16> @llvm.aarch64.neon.vuqadd.v1i16(<1 x i16> %vuqadd.i, <1 x i16> %vuqadd1.i)
- %0 = extractelement <1 x i16> %vuqadd2.i, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16>, <1 x i16>)
-
-define i32 @test_vuqadds_s32(i32 %a, i32 %b) {
-; CHECK: test_vuqadds_s32
-; CHECK: suqadd {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %vuqadd.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %vuqadd1.i = insertelement <1 x i32> undef, i32 %b, i32 0
- %vuqadd2.i = call <1 x i32> @llvm.aarch64.neon.vuqadd.v1i32(<1 x i32> %vuqadd.i, <1 x i32> %vuqadd1.i)
- %0 = extractelement <1 x i32> %vuqadd2.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vsqadd.v1i32(<1 x i32>, <1 x i32>)
-
-define i64 @test_vuqaddd_s64(i64 %a, i64 %b) {
-; CHECK: test_vuqaddd_s64
-; CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %vuqadd.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vuqadd1.i = insertelement <1 x i64> undef, i64 %b, i32 0
- %vuqadd2.i = call <1 x i64> @llvm.aarch64.neon.vuqadd.v1i64(<1 x i64> %vuqadd.i, <1 x i64> %vuqadd1.i)
- %0 = extractelement <1 x i64> %vuqadd2.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vsqadd.v1i64(<1 x i64>, <1 x i64>)
-
-define i8 @test_vsqaddb_u8(i8 %a, i8 %b) {
-; CHECK: test_vsqaddb_u8
-; CHECK: usqadd {{b[0-9]+}}, {{b[0-9]+}}
-entry:
- %vsqadd.i = insertelement <1 x i8> undef, i8 %a, i32 0
- %vsqadd1.i = insertelement <1 x i8> undef, i8 %b, i32 0
- %vsqadd2.i = call <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8> %vsqadd.i, <1 x i8> %vsqadd1.i)
- %0 = extractelement <1 x i8> %vsqadd2.i, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vuqadd.v1i8(<1 x i8>, <1 x i8>)
-
-define i16 @test_vsqaddh_u16(i16 %a, i16 %b) {
-; CHECK: test_vsqaddh_u16
-; CHECK: usqadd {{h[0-9]+}}, {{h[0-9]+}}
-entry:
- %vsqadd.i = insertelement <1 x i16> undef, i16 %a, i32 0
- %vsqadd1.i = insertelement <1 x i16> undef, i16 %b, i32 0
- %vsqadd2.i = call <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16> %vsqadd.i, <1 x i16> %vsqadd1.i)
- %0 = extractelement <1 x i16> %vsqadd2.i, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vuqadd.v1i16(<1 x i16>, <1 x i16>)
-
-define i32 @test_vsqadds_u32(i32 %a, i32 %b) {
-; CHECK: test_vsqadds_u32
-; CHECK: usqadd {{s[0-9]+}}, {{s[0-9]+}}
-entry:
- %vsqadd.i = insertelement <1 x i32> undef, i32 %a, i32 0
- %vsqadd1.i = insertelement <1 x i32> undef, i32 %b, i32 0
- %vsqadd2.i = call <1 x i32> @llvm.aarch64.neon.vsqadd.v1i32(<1 x i32> %vsqadd.i, <1 x i32> %vsqadd1.i)
- %0 = extractelement <1 x i32> %vsqadd2.i, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vuqadd.v1i32(<1 x i32>, <1 x i32>)
-
-define i64 @test_vsqaddd_u64(i64 %a, i64 %b) {
-; CHECK: test_vsqaddd_u64
-; CHECK: usqadd {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %vsqadd.i = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsqadd1.i = insertelement <1 x i64> undef, i64 %b, i32 0
- %vsqadd2.i = call <1 x i64> @llvm.aarch64.neon.vsqadd.v1i64(<1 x i64> %vsqadd.i, <1 x i64> %vsqadd1.i)
- %0 = extractelement <1 x i64> %vsqadd2.i, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vuqadd.v1i64(<1 x i64>, <1 x i64>)
diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll b/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll
deleted file mode 100644
index dbf9669..0000000
--- a/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll
+++ /dev/null
@@ -1,94 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_uqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uqrshl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sqrshl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vqrshlu.v1i8(<1 x i8>, <1 x i8>)
-declare <1 x i8> @llvm.aarch64.neon.vqrshls.v1i8(<1 x i8>, <1 x i8>)
-
-define <1 x i8> @test_uqrshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
-; CHECK: test_uqrshl_v1i8_aarch64:
- %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqrshlu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: uqrshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
-
- ret <1 x i8> %tmp1
-}
-
-define <1 x i8> @test_sqrshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
-; CHECK: test_sqrshl_v1i8_aarch64:
- %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqrshls.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: sqrshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
- ret <1 x i8> %tmp1
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vqrshlu.v1i16(<1 x i16>, <1 x i16>)
-declare <1 x i16> @llvm.aarch64.neon.vqrshls.v1i16(<1 x i16>, <1 x i16>)
-
-define <1 x i16> @test_uqrshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
-; CHECK: test_uqrshl_v1i16_aarch64:
- %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqrshlu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: uqrshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
-
- ret <1 x i16> %tmp1
-}
-
-define <1 x i16> @test_sqrshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
-; CHECK: test_sqrshl_v1i16_aarch64:
- %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqrshls.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: sqrshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
- ret <1 x i16> %tmp1
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vqrshlu.v1i32(<1 x i32>, <1 x i32>)
-declare <1 x i32> @llvm.aarch64.neon.vqrshls.v1i32(<1 x i32>, <1 x i32>)
-
-define <1 x i32> @test_uqrshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
-; CHECK: test_uqrshl_v1i32_aarch64:
- %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqrshlu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: uqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-
- ret <1 x i32> %tmp1
-}
-
-define <1 x i32> @test_sqrshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
-; CHECK: test_sqrshl_v1i32_aarch64:
- %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqrshls.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: sqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- ret <1 x i32> %tmp1
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vqrshlu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vqrshls.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_uqrshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uqrshl_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqrshlu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sqrshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sqrshl_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqrshls.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-
-
diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll b/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll
deleted file mode 100644
index 0a1f4c9..0000000
--- a/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll
+++ /dev/null
@@ -1,88 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_uqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uqshl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sqshl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vqshlu.v1i8(<1 x i8>, <1 x i8>)
-declare <1 x i8> @llvm.aarch64.neon.vqshls.v1i8(<1 x i8>, <1 x i8>)
-
-define <1 x i8> @test_uqshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
-; CHECK: test_uqshl_v1i8_aarch64:
- %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqshlu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: uqshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
- ret <1 x i8> %tmp1
-}
-
-define <1 x i8> @test_sqshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
-; CHECK: test_sqshl_v1i8_aarch64:
- %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqshls.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
-;CHECK: sqshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
- ret <1 x i8> %tmp1
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vqshlu.v1i16(<1 x i16>, <1 x i16>)
-declare <1 x i16> @llvm.aarch64.neon.vqshls.v1i16(<1 x i16>, <1 x i16>)
-
-define <1 x i16> @test_uqshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
-; CHECK: test_uqshl_v1i16_aarch64:
- %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqshlu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: uqshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
- ret <1 x i16> %tmp1
-}
-
-define <1 x i16> @test_sqshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
-; CHECK: test_sqshl_v1i16_aarch64:
- %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqshls.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
-;CHECK: sqshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
- ret <1 x i16> %tmp1
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vqshlu.v1i32(<1 x i32>, <1 x i32>)
-declare <1 x i32> @llvm.aarch64.neon.vqshls.v1i32(<1 x i32>, <1 x i32>)
-
-define <1 x i32> @test_uqshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
-; CHECK: test_uqshl_v1i32_aarch64:
- %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqshlu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- ret <1 x i32> %tmp1
-}
-
-define <1 x i32> @test_sqshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
-; CHECK: test_sqshl_v1i32_aarch64:
- %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqshls.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
-;CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
- ret <1 x i32> %tmp1
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vqshlu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vqshls.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_uqshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uqshl_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqshlu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sqshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sqshl_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqshls.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-;CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-
diff --git a/test/CodeGen/AArch64/neon-scalar-shift-imm.ll b/test/CodeGen/AArch64/neon-scalar-shift-imm.ll
deleted file mode 100644
index 6224361..0000000
--- a/test/CodeGen/AArch64/neon-scalar-shift-imm.ll
+++ /dev/null
@@ -1,531 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define i64 @test_vshrd_n_s64(i64 %a) {
-; CHECK: test_vshrd_n_s64
-; CHECK: sshr {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vsshr = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsshr1 = call <1 x i64> @llvm.aarch64.neon.vshrds.n(<1 x i64> %vsshr, i32 63)
- %0 = extractelement <1 x i64> %vsshr1, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vshrds.n(<1 x i64>, i32)
-
-define i64 @test_vshrd_n_u64(i64 %a) {
-; CHECK: test_vshrd_n_u64
-; CHECK: ushr {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vushr = insertelement <1 x i64> undef, i64 %a, i32 0
- %vushr1 = call <1 x i64> @llvm.aarch64.neon.vshrdu.n(<1 x i64> %vushr, i32 63)
- %0 = extractelement <1 x i64> %vushr1, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vshrdu.n(<1 x i64>, i32)
-
-define i64 @test_vrshrd_n_s64(i64 %a) {
-; CHECK: test_vrshrd_n_s64
-; CHECK: srshr {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vsrshr = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsrshr1 = call <1 x i64> @llvm.aarch64.neon.vsrshr.v1i64(<1 x i64> %vsrshr, i32 63)
- %0 = extractelement <1 x i64> %vsrshr1, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vsrshr.v1i64(<1 x i64>, i32)
-
-define i64 @test_vrshrd_n_u64(i64 %a) {
-; CHECK: test_vrshrd_n_u64
-; CHECK: urshr {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vurshr = insertelement <1 x i64> undef, i64 %a, i32 0
- %vurshr1 = call <1 x i64> @llvm.aarch64.neon.vurshr.v1i64(<1 x i64> %vurshr, i32 63)
- %0 = extractelement <1 x i64> %vurshr1, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vurshr.v1i64(<1 x i64>, i32)
-
-define i64 @test_vsrad_n_s64(i64 %a, i64 %b) {
-; CHECK: test_vsrad_n_s64
-; CHECK: ssra {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vssra = insertelement <1 x i64> undef, i64 %a, i32 0
- %vssra1 = insertelement <1 x i64> undef, i64 %b, i32 0
- %vssra2 = call <1 x i64> @llvm.aarch64.neon.vsrads.n(<1 x i64> %vssra, <1 x i64> %vssra1, i32 63)
- %0 = extractelement <1 x i64> %vssra2, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vsrads.n(<1 x i64>, <1 x i64>, i32)
-
-define i64 @test_vsrad_n_u64(i64 %a, i64 %b) {
-; CHECK: test_vsrad_n_u64
-; CHECK: usra {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vusra = insertelement <1 x i64> undef, i64 %a, i32 0
- %vusra1 = insertelement <1 x i64> undef, i64 %b, i32 0
- %vusra2 = call <1 x i64> @llvm.aarch64.neon.vsradu.n(<1 x i64> %vusra, <1 x i64> %vusra1, i32 63)
- %0 = extractelement <1 x i64> %vusra2, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vsradu.n(<1 x i64>, <1 x i64>, i32)
-
-define i64 @test_vrsrad_n_s64(i64 %a, i64 %b) {
-; CHECK: test_vrsrad_n_s64
-; CHECK: srsra {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vsrsra = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsrsra1 = insertelement <1 x i64> undef, i64 %b, i32 0
- %vsrsra2 = call <1 x i64> @llvm.aarch64.neon.vrsrads.n(<1 x i64> %vsrsra, <1 x i64> %vsrsra1, i32 63)
- %0 = extractelement <1 x i64> %vsrsra2, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vrsrads.n(<1 x i64>, <1 x i64>, i32)
-
-define i64 @test_vrsrad_n_u64(i64 %a, i64 %b) {
-; CHECK: test_vrsrad_n_u64
-; CHECK: ursra {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vursra = insertelement <1 x i64> undef, i64 %a, i32 0
- %vursra1 = insertelement <1 x i64> undef, i64 %b, i32 0
- %vursra2 = call <1 x i64> @llvm.aarch64.neon.vrsradu.n(<1 x i64> %vursra, <1 x i64> %vursra1, i32 63)
- %0 = extractelement <1 x i64> %vursra2, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vrsradu.n(<1 x i64>, <1 x i64>, i32)
-
-define i64 @test_vshld_n_s64(i64 %a) {
-; CHECK: test_vshld_n_s64
-; CHECK: shl {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vshl = insertelement <1 x i64> undef, i64 %a, i32 0
- %vshl1 = call <1 x i64> @llvm.aarch64.neon.vshld.n(<1 x i64> %vshl, i32 63)
- %0 = extractelement <1 x i64> %vshl1, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vshld.n(<1 x i64>, i32)
-
-define i64 @test_vshld_n_u64(i64 %a) {
-; CHECK: test_vshld_n_u64
-; CHECK: shl {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vshl = insertelement <1 x i64> undef, i64 %a, i32 0
- %vshl1 = call <1 x i64> @llvm.aarch64.neon.vshld.n(<1 x i64> %vshl, i32 63)
- %0 = extractelement <1 x i64> %vshl1, i32 0
- ret i64 %0
-}
-
-define i8 @test_vqshlb_n_s8(i8 %a) {
-; CHECK: test_vqshlb_n_s8
-; CHECK: sqshl {{b[0-9]+}}, {{b[0-9]+}}, #7
-entry:
- %vsqshl = insertelement <1 x i8> undef, i8 %a, i32 0
- %vsqshl1 = call <1 x i8> @llvm.aarch64.neon.vqshls.n.v1i8(<1 x i8> %vsqshl, i32 7)
- %0 = extractelement <1 x i8> %vsqshl1, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vqshls.n.v1i8(<1 x i8>, i32)
-
-define i16 @test_vqshlh_n_s16(i16 %a) {
-; CHECK: test_vqshlh_n_s16
-; CHECK: sqshl {{h[0-9]+}}, {{h[0-9]+}}, #15
-entry:
- %vsqshl = insertelement <1 x i16> undef, i16 %a, i32 0
- %vsqshl1 = call <1 x i16> @llvm.aarch64.neon.vqshls.n.v1i16(<1 x i16> %vsqshl, i32 15)
- %0 = extractelement <1 x i16> %vsqshl1, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vqshls.n.v1i16(<1 x i16>, i32)
-
-define i32 @test_vqshls_n_s32(i32 %a) {
-; CHECK: test_vqshls_n_s32
-; CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, #31
-entry:
- %vsqshl = insertelement <1 x i32> undef, i32 %a, i32 0
- %vsqshl1 = call <1 x i32> @llvm.aarch64.neon.vqshls.n.v1i32(<1 x i32> %vsqshl, i32 31)
- %0 = extractelement <1 x i32> %vsqshl1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vqshls.n.v1i32(<1 x i32>, i32)
-
-define i64 @test_vqshld_n_s64(i64 %a) {
-; CHECK: test_vqshld_n_s64
-; CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vsqshl = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsqshl1 = call <1 x i64> @llvm.aarch64.neon.vqshls.n.v1i64(<1 x i64> %vsqshl, i32 63)
- %0 = extractelement <1 x i64> %vsqshl1, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vqshls.n.v1i64(<1 x i64>, i32)
-
-define i8 @test_vqshlb_n_u8(i8 %a) {
-; CHECK: test_vqshlb_n_u8
-; CHECK: uqshl {{b[0-9]+}}, {{b[0-9]+}}, #7
-entry:
- %vuqshl = insertelement <1 x i8> undef, i8 %a, i32 0
- %vuqshl1 = call <1 x i8> @llvm.aarch64.neon.vqshlu.n.v1i8(<1 x i8> %vuqshl, i32 7)
- %0 = extractelement <1 x i8> %vuqshl1, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vqshlu.n.v1i8(<1 x i8>, i32)
-
-define i16 @test_vqshlh_n_u16(i16 %a) {
-; CHECK: test_vqshlh_n_u16
-; CHECK: uqshl {{h[0-9]+}}, {{h[0-9]+}}, #15
-entry:
- %vuqshl = insertelement <1 x i16> undef, i16 %a, i32 0
- %vuqshl1 = call <1 x i16> @llvm.aarch64.neon.vqshlu.n.v1i16(<1 x i16> %vuqshl, i32 15)
- %0 = extractelement <1 x i16> %vuqshl1, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vqshlu.n.v1i16(<1 x i16>, i32)
-
-define i32 @test_vqshls_n_u32(i32 %a) {
-; CHECK: test_vqshls_n_u32
-; CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, #31
-entry:
- %vuqshl = insertelement <1 x i32> undef, i32 %a, i32 0
- %vuqshl1 = call <1 x i32> @llvm.aarch64.neon.vqshlu.n.v1i32(<1 x i32> %vuqshl, i32 31)
- %0 = extractelement <1 x i32> %vuqshl1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vqshlu.n.v1i32(<1 x i32>, i32)
-
-define i64 @test_vqshld_n_u64(i64 %a) {
-; CHECK: test_vqshld_n_u64
-; CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vuqshl = insertelement <1 x i64> undef, i64 %a, i32 0
- %vuqshl1 = call <1 x i64> @llvm.aarch64.neon.vqshlu.n.v1i64(<1 x i64> %vuqshl, i32 63)
- %0 = extractelement <1 x i64> %vuqshl1, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vqshlu.n.v1i64(<1 x i64>, i32)
-
-define i8 @test_vqshlub_n_s8(i8 %a) {
-; CHECK: test_vqshlub_n_s8
-; CHECK: sqshlu {{b[0-9]+}}, {{b[0-9]+}}, #7
-entry:
- %vsqshlu = insertelement <1 x i8> undef, i8 %a, i32 0
- %vsqshlu1 = call <1 x i8> @llvm.aarch64.neon.vsqshlu.v1i8(<1 x i8> %vsqshlu, i32 7)
- %0 = extractelement <1 x i8> %vsqshlu1, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vsqshlu.v1i8(<1 x i8>, i32)
-
-define i16 @test_vqshluh_n_s16(i16 %a) {
-; CHECK: test_vqshluh_n_s16
-; CHECK: sqshlu {{h[0-9]+}}, {{h[0-9]+}}, #15
-entry:
- %vsqshlu = insertelement <1 x i16> undef, i16 %a, i32 0
- %vsqshlu1 = call <1 x i16> @llvm.aarch64.neon.vsqshlu.v1i16(<1 x i16> %vsqshlu, i32 15)
- %0 = extractelement <1 x i16> %vsqshlu1, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vsqshlu.v1i16(<1 x i16>, i32)
-
-define i32 @test_vqshlus_n_s32(i32 %a) {
-; CHECK: test_vqshlus_n_s32
-; CHECK: sqshlu {{s[0-9]+}}, {{s[0-9]+}}, #31
-entry:
- %vsqshlu = insertelement <1 x i32> undef, i32 %a, i32 0
- %vsqshlu1 = call <1 x i32> @llvm.aarch64.neon.vsqshlu.v1i32(<1 x i32> %vsqshlu, i32 31)
- %0 = extractelement <1 x i32> %vsqshlu1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vsqshlu.v1i32(<1 x i32>, i32)
-
-define i64 @test_vqshlud_n_s64(i64 %a) {
-; CHECK: test_vqshlud_n_s64
-; CHECK: sqshlu {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vsqshlu = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsqshlu1 = call <1 x i64> @llvm.aarch64.neon.vsqshlu.v1i64(<1 x i64> %vsqshlu, i32 63)
- %0 = extractelement <1 x i64> %vsqshlu1, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vsqshlu.v1i64(<1 x i64>, i32)
-
-define i64 @test_vsrid_n_s64(i64 %a, i64 %b) {
-; CHECK: test_vsrid_n_s64
-; CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vsri = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsri1 = insertelement <1 x i64> undef, i64 %b, i32 0
- %vsri2 = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> %vsri, <1 x i64> %vsri1, i32 63)
- %0 = extractelement <1 x i64> %vsri2, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64>, <1 x i64>, i32)
-
-define i64 @test_vsrid_n_u64(i64 %a, i64 %b) {
-; CHECK: test_vsrid_n_u64
-; CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vsri = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsri1 = insertelement <1 x i64> undef, i64 %b, i32 0
- %vsri2 = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> %vsri, <1 x i64> %vsri1, i32 63)
- %0 = extractelement <1 x i64> %vsri2, i32 0
- ret i64 %0
-}
-
-define i64 @test_vslid_n_s64(i64 %a, i64 %b) {
-; CHECK: test_vslid_n_s64
-; CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vsli = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsli1 = insertelement <1 x i64> undef, i64 %b, i32 0
- %vsli2 = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %vsli, <1 x i64> %vsli1, i32 63)
- %0 = extractelement <1 x i64> %vsli2, i32 0
- ret i64 %0
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64>, <1 x i64>, i32)
-
-define i64 @test_vslid_n_u64(i64 %a, i64 %b) {
-; CHECK: test_vslid_n_u64
-; CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #63
-entry:
- %vsli = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsli1 = insertelement <1 x i64> undef, i64 %b, i32 0
- %vsli2 = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %vsli, <1 x i64> %vsli1, i32 63)
- %0 = extractelement <1 x i64> %vsli2, i32 0
- ret i64 %0
-}
-
-define i8 @test_vqshrnh_n_s16(i16 %a) {
-; CHECK: test_vqshrnh_n_s16
-; CHECK: sqshrn {{b[0-9]+}}, {{h[0-9]+}}, #8
-entry:
- %vsqshrn = insertelement <1 x i16> undef, i16 %a, i32 0
- %vsqshrn1 = call <1 x i8> @llvm.aarch64.neon.vsqshrn.v1i8(<1 x i16> %vsqshrn, i32 8)
- %0 = extractelement <1 x i8> %vsqshrn1, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vsqshrn.v1i8(<1 x i16>, i32)
-
-define i16 @test_vqshrns_n_s32(i32 %a) {
-; CHECK: test_vqshrns_n_s32
-; CHECK: sqshrn {{h[0-9]+}}, {{s[0-9]+}}, #16
-entry:
- %vsqshrn = insertelement <1 x i32> undef, i32 %a, i32 0
- %vsqshrn1 = call <1 x i16> @llvm.aarch64.neon.vsqshrn.v1i16(<1 x i32> %vsqshrn, i32 16)
- %0 = extractelement <1 x i16> %vsqshrn1, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vsqshrn.v1i16(<1 x i32>, i32)
-
-define i32 @test_vqshrnd_n_s64(i64 %a) {
-; CHECK: test_vqshrnd_n_s64
-; CHECK: sqshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
-entry:
- %vsqshrn = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsqshrn1 = call <1 x i32> @llvm.aarch64.neon.vsqshrn.v1i32(<1 x i64> %vsqshrn, i32 32)
- %0 = extractelement <1 x i32> %vsqshrn1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vsqshrn.v1i32(<1 x i64>, i32)
-
-define i8 @test_vqshrnh_n_u16(i16 %a) {
-; CHECK: test_vqshrnh_n_u16
-; CHECK: uqshrn {{b[0-9]+}}, {{h[0-9]+}}, #8
-entry:
- %vuqshrn = insertelement <1 x i16> undef, i16 %a, i32 0
- %vuqshrn1 = call <1 x i8> @llvm.aarch64.neon.vuqshrn.v1i8(<1 x i16> %vuqshrn, i32 8)
- %0 = extractelement <1 x i8> %vuqshrn1, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vuqshrn.v1i8(<1 x i16>, i32)
-
-define i16 @test_vqshrns_n_u32(i32 %a) {
-; CHECK: test_vqshrns_n_u32
-; CHECK: uqshrn {{h[0-9]+}}, {{s[0-9]+}}, #16
-entry:
- %vuqshrn = insertelement <1 x i32> undef, i32 %a, i32 0
- %vuqshrn1 = call <1 x i16> @llvm.aarch64.neon.vuqshrn.v1i16(<1 x i32> %vuqshrn, i32 16)
- %0 = extractelement <1 x i16> %vuqshrn1, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vuqshrn.v1i16(<1 x i32>, i32)
-
-define i32 @test_vqshrnd_n_u64(i64 %a) {
-; CHECK: test_vqshrnd_n_u64
-; CHECK: uqshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
-entry:
- %vuqshrn = insertelement <1 x i64> undef, i64 %a, i32 0
- %vuqshrn1 = call <1 x i32> @llvm.aarch64.neon.vuqshrn.v1i32(<1 x i64> %vuqshrn, i32 32)
- %0 = extractelement <1 x i32> %vuqshrn1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vuqshrn.v1i32(<1 x i64>, i32)
-
-define i8 @test_vqrshrnh_n_s16(i16 %a) {
-; CHECK: test_vqrshrnh_n_s16
-; CHECK: sqrshrn {{b[0-9]+}}, {{h[0-9]+}}, #8
-entry:
- %vsqrshrn = insertelement <1 x i16> undef, i16 %a, i32 0
- %vsqrshrn1 = call <1 x i8> @llvm.aarch64.neon.vsqrshrn.v1i8(<1 x i16> %vsqrshrn, i32 8)
- %0 = extractelement <1 x i8> %vsqrshrn1, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vsqrshrn.v1i8(<1 x i16>, i32)
-
-define i16 @test_vqrshrns_n_s32(i32 %a) {
-; CHECK: test_vqrshrns_n_s32
-; CHECK: sqrshrn {{h[0-9]+}}, {{s[0-9]+}}, #16
-entry:
- %vsqrshrn = insertelement <1 x i32> undef, i32 %a, i32 0
- %vsqrshrn1 = call <1 x i16> @llvm.aarch64.neon.vsqrshrn.v1i16(<1 x i32> %vsqrshrn, i32 16)
- %0 = extractelement <1 x i16> %vsqrshrn1, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vsqrshrn.v1i16(<1 x i32>, i32)
-
-define i32 @test_vqrshrnd_n_s64(i64 %a) {
-; CHECK: test_vqrshrnd_n_s64
-; CHECK: sqrshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
-entry:
- %vsqrshrn = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsqrshrn1 = call <1 x i32> @llvm.aarch64.neon.vsqrshrn.v1i32(<1 x i64> %vsqrshrn, i32 32)
- %0 = extractelement <1 x i32> %vsqrshrn1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vsqrshrn.v1i32(<1 x i64>, i32)
-
-define i8 @test_vqrshrnh_n_u16(i16 %a) {
-; CHECK: test_vqrshrnh_n_u16
-; CHECK: uqrshrn {{b[0-9]+}}, {{h[0-9]+}}, #8
-entry:
- %vuqrshrn = insertelement <1 x i16> undef, i16 %a, i32 0
- %vuqrshrn1 = call <1 x i8> @llvm.aarch64.neon.vuqrshrn.v1i8(<1 x i16> %vuqrshrn, i32 8)
- %0 = extractelement <1 x i8> %vuqrshrn1, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vuqrshrn.v1i8(<1 x i16>, i32)
-
-define i16 @test_vqrshrns_n_u32(i32 %a) {
-; CHECK: test_vqrshrns_n_u32
-; CHECK: uqrshrn {{h[0-9]+}}, {{s[0-9]+}}, #16
-entry:
- %vuqrshrn = insertelement <1 x i32> undef, i32 %a, i32 0
- %vuqrshrn1 = call <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32> %vuqrshrn, i32 16)
- %0 = extractelement <1 x i16> %vuqrshrn1, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32>, i32)
-
-define i32 @test_vqrshrnd_n_u64(i64 %a) {
-; CHECK: test_vqrshrnd_n_u64
-; CHECK: uqrshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
-entry:
- %vuqrshrn = insertelement <1 x i64> undef, i64 %a, i32 0
- %vuqrshrn1 = call <1 x i32> @llvm.aarch64.neon.vuqrshrn.v1i32(<1 x i64> %vuqrshrn, i32 32)
- %0 = extractelement <1 x i32> %vuqrshrn1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vuqrshrn.v1i32(<1 x i64>, i32)
-
-define i8 @test_vqshrunh_n_s16(i16 %a) {
-; CHECK: test_vqshrunh_n_s16
-; CHECK: sqshrun {{b[0-9]+}}, {{h[0-9]+}}, #8
-entry:
- %vsqshrun = insertelement <1 x i16> undef, i16 %a, i32 0
- %vsqshrun1 = call <1 x i8> @llvm.aarch64.neon.vsqshrun.v1i8(<1 x i16> %vsqshrun, i32 8)
- %0 = extractelement <1 x i8> %vsqshrun1, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vsqshrun.v1i8(<1 x i16>, i32)
-
-define i16 @test_vqshruns_n_s32(i32 %a) {
-; CHECK: test_vqshruns_n_s32
-; CHECK: sqshrun {{h[0-9]+}}, {{s[0-9]+}}, #16
-entry:
- %vsqshrun = insertelement <1 x i32> undef, i32 %a, i32 0
- %vsqshrun1 = call <1 x i16> @llvm.aarch64.neon.vsqshrun.v1i16(<1 x i32> %vsqshrun, i32 16)
- %0 = extractelement <1 x i16> %vsqshrun1, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vsqshrun.v1i16(<1 x i32>, i32)
-
-define i32 @test_vqshrund_n_s64(i64 %a) {
-; CHECK: test_vqshrund_n_s64
-; CHECK: sqshrun {{s[0-9]+}}, {{d[0-9]+}}, #32
-entry:
- %vsqshrun = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsqshrun1 = call <1 x i32> @llvm.aarch64.neon.vsqshrun.v1i32(<1 x i64> %vsqshrun, i32 32)
- %0 = extractelement <1 x i32> %vsqshrun1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vsqshrun.v1i32(<1 x i64>, i32)
-
-define i8 @test_vqrshrunh_n_s16(i16 %a) {
-; CHECK: test_vqrshrunh_n_s16
-; CHECK: sqrshrun {{b[0-9]+}}, {{h[0-9]+}}, #8
-entry:
- %vsqrshrun = insertelement <1 x i16> undef, i16 %a, i32 0
- %vsqrshrun1 = call <1 x i8> @llvm.aarch64.neon.vsqrshrun.v1i8(<1 x i16> %vsqrshrun, i32 8)
- %0 = extractelement <1 x i8> %vsqrshrun1, i32 0
- ret i8 %0
-}
-
-declare <1 x i8> @llvm.aarch64.neon.vsqrshrun.v1i8(<1 x i16>, i32)
-
-define i16 @test_vqrshruns_n_s32(i32 %a) {
-; CHECK: test_vqrshruns_n_s32
-; CHECK: sqrshrun {{h[0-9]+}}, {{s[0-9]+}}, #16
-entry:
- %vsqrshrun = insertelement <1 x i32> undef, i32 %a, i32 0
- %vsqrshrun1 = call <1 x i16> @llvm.aarch64.neon.vsqrshrun.v1i16(<1 x i32> %vsqrshrun, i32 16)
- %0 = extractelement <1 x i16> %vsqrshrun1, i32 0
- ret i16 %0
-}
-
-declare <1 x i16> @llvm.aarch64.neon.vsqrshrun.v1i16(<1 x i32>, i32)
-
-define i32 @test_vqrshrund_n_s64(i64 %a) {
-; CHECK: test_vqrshrund_n_s64
-; CHECK: sqrshrun {{s[0-9]+}}, {{d[0-9]+}}, #32
-entry:
- %vsqrshrun = insertelement <1 x i64> undef, i64 %a, i32 0
- %vsqrshrun1 = call <1 x i32> @llvm.aarch64.neon.vsqrshrun.v1i32(<1 x i64> %vsqrshrun, i32 32)
- %0 = extractelement <1 x i32> %vsqrshrun1, i32 0
- ret i32 %0
-}
-
-declare <1 x i32> @llvm.aarch64.neon.vsqrshrun.v1i32(<1 x i64>, i32)
diff --git a/test/CodeGen/AArch64/neon-scalar-shift.ll b/test/CodeGen/AArch64/neon-scalar-shift.ll
deleted file mode 100644
index b712ea4..0000000
--- a/test/CodeGen/AArch64/neon-scalar-shift.ll
+++ /dev/null
@@ -1,236 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-declare <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_ushl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_ushl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sshl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vshldu(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.aarch64.neon.vshlds(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_ushl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_ushl_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vshldu(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sshl_v1i64_aarch64:
- %tmp1 = call <1 x i64> @llvm.aarch64.neon.vshlds(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_vtst_s64(<1 x i64> %a, <1 x i64> %b) {
-; CHECK-LABEL: test_vtst_s64
-; CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %0 = and <1 x i64> %a, %b
- %1 = icmp ne <1 x i64> %0, zeroinitializer
- %vtst.i = sext <1 x i1> %1 to <1 x i64>
- ret <1 x i64> %vtst.i
-}
-
-define <1 x i64> @test_vtst_u64(<1 x i64> %a, <1 x i64> %b) {
-; CHECK-LABEL: test_vtst_u64
-; CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-entry:
- %0 = and <1 x i64> %a, %b
- %1 = icmp ne <1 x i64> %0, zeroinitializer
- %vtst.i = sext <1 x i1> %1 to <1 x i64>
- ret <1 x i64> %vtst.i
-}
-
-define <1 x i64> @test_vsli_n_p64(<1 x i64> %a, <1 x i64> %b) {
-; CHECK-LABEL: test_vsli_n_p64
-; CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #0
-entry:
- %vsli_n2 = tail call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %a, <1 x i64> %b, i32 0)
- ret <1 x i64> %vsli_n2
-}
-
-declare <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64>, <1 x i64>, i32)
-
-define <2 x i64> @test_vsliq_n_p64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: test_vsliq_n_p64
-; CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
-entry:
- %vsli_n2 = tail call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> %a, <2 x i64> %b, i32 0)
- ret <2 x i64> %vsli_n2
-}
-
-declare <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32)
-
-define <2 x i32> @test_vrsqrte_u32(<2 x i32> %a) {
-; CHECK-LABEL: test_vrsqrte_u32
-; CHECK: ursqrte {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-entry:
- %vrsqrte1.i = tail call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %a)
- ret <2 x i32> %vrsqrte1.i
-}
-
-define <4 x i32> @test_vrsqrteq_u32(<4 x i32> %a) {
-; CHECK-LABEL: test_vrsqrteq_u32
-; CHECK: ursqrte {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-entry:
- %vrsqrte1.i = tail call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %a)
- ret <4 x i32> %vrsqrte1.i
-}
-
-define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) {
-; CHECK-LABEL: test_vqshl_n_s8
-; CHECK: sqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
-entry:
- %vqshl_n = tail call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer)
- ret <8 x i8> %vqshl_n
-}
-
-declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>)
-
-define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) {
-; CHECK-LABEL: test_vqshlq_n_s8
-; CHECK: sqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
-entry:
- %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer)
- ret <16 x i8> %vqshl_n
-}
-
-declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>)
-
-define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) {
-; CHECK-LABEL: test_vqshl_n_s16
-; CHECK: sqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
-entry:
- %vqshl_n1 = tail call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %a, <4 x i16> zeroinitializer)
- ret <4 x i16> %vqshl_n1
-}
-
-declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>)
-
-define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) {
-; CHECK-LABEL: test_vqshlq_n_s16
-; CHECK: sqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
-entry:
- %vqshl_n1 = tail call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %a, <8 x i16> zeroinitializer)
- ret <8 x i16> %vqshl_n1
-}
-
-declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>)
-
-define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) {
-; CHECK-LABEL: test_vqshl_n_s32
-; CHECK: sqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
-entry:
- %vqshl_n1 = tail call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %a, <2 x i32> zeroinitializer)
- ret <2 x i32> %vqshl_n1
-}
-
-declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>)
-
-define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) {
-; CHECK-LABEL: test_vqshlq_n_s32
-; CHECK: sqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
-entry:
- %vqshl_n1 = tail call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %a, <4 x i32> zeroinitializer)
- ret <4 x i32> %vqshl_n1
-}
-
-declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>)
-
-define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) {
-; CHECK-LABEL: test_vqshlq_n_s64
-; CHECK: sqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
-entry:
- %vqshl_n1 = tail call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %a, <2 x i64> zeroinitializer)
- ret <2 x i64> %vqshl_n1
-}
-
-declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>)
-
-define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) {
-; CHECK-LABEL: test_vqshl_n_u8
-; CHECK: uqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
-entry:
- %vqshl_n = tail call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer)
- ret <8 x i8> %vqshl_n
-}
-
-declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>)
-
-define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) {
-; CHECK-LABEL: test_vqshlq_n_u8
-; CHECK: uqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
-entry:
- %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer)
- ret <16 x i8> %vqshl_n
-}
-
-declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>)
-
-define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) {
-; CHECK-LABEL: test_vqshl_n_u16
-; CHECK: uqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
-entry:
- %vqshl_n1 = tail call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %a, <4 x i16> zeroinitializer)
- ret <4 x i16> %vqshl_n1
-}
-
-declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>)
-
-define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) {
-; CHECK-LABEL: test_vqshlq_n_u16
-; CHECK: uqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
-entry:
- %vqshl_n1 = tail call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %a, <8 x i16> zeroinitializer)
- ret <8 x i16> %vqshl_n1
-}
-
-declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>)
-
-define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) {
-; CHECK-LABEL: test_vqshl_n_u32
-; CHECK: uqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
-entry:
- %vqshl_n1 = tail call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %a, <2 x i32> zeroinitializer)
- ret <2 x i32> %vqshl_n1
-}
-
-declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>)
-
-define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) {
-; CHECK-LABEL: test_vqshlq_n_u32
-; CHECK: uqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
-entry:
- %vqshl_n1 = tail call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %a, <4 x i32> zeroinitializer)
- ret <4 x i32> %vqshl_n1
-}
-
-declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>)
-
-define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) {
-; CHECK-LABEL: test_vqshlq_n_u64
-; CHECK: uqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d,
-entry:
- %vqshl_n1 = tail call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %a, <2 x i64> zeroinitializer)
- ret <2 x i64> %vqshl_n1
-}
-
-declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>)
-
-declare <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32>)
-
-declare <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32>)
diff --git a/test/CodeGen/AArch64/neon-shift.ll b/test/CodeGen/AArch64/neon-shift.ll
deleted file mode 100644
index 33b04ce..0000000
--- a/test/CodeGen/AArch64/neon-shift.ll
+++ /dev/null
@@ -1,171 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-declare <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_uqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_uqshl_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: ushl v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-define <8 x i8> @test_sqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_sqshl_v8i8:
- %tmp1 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: sshl v0.8b, v0.8b, v1.8b
- ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_ushl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_ushl_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: ushl v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-define <16 x i8> @test_sshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_sshl_v16i8:
- %tmp1 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: sshl v0.16b, v0.16b, v1.16b
- ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_ushl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_ushl_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: ushl v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-define <4 x i16> @test_sshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_sshl_v4i16:
- %tmp1 = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: sshl v0.4h, v0.4h, v1.4h
- ret <4 x i16> %tmp1
-}
-
-declare <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_ushl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_ushl_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: ushl v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-define <8 x i16> @test_sshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_sshl_v8i16:
- %tmp1 = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: sshl v0.8h, v0.8h, v1.8h
- ret <8 x i16> %tmp1
-}
-
-declare <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_ushl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_ushl_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: ushl v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-define <2 x i32> @test_sshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_sshl_v2i32:
- %tmp1 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: sshl v0.2s, v0.2s, v1.2s
- ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_ushl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_ushl_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: ushl v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-define <4 x i32> @test_sshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_sshl_v4i32:
- %tmp1 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: sshl v0.4s, v0.4s, v1.4s
- ret <4 x i32> %tmp1
-}
-
-declare <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64>, <2 x i64>)
-declare <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64>, <2 x i64>)
-
-define <2 x i64> @test_ushl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_ushl_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: ushl v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
-define <2 x i64> @test_sshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_sshl_v2i64:
- %tmp1 = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: sshl v0.2d, v0.2d, v1.2d
- ret <2 x i64> %tmp1
-}
-
-
-define <8 x i8> @test_shl_v8i8(<8 x i8> %a) {
-; CHECK: test_shl_v8i8:
-; CHECK: shl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %tmp = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- ret <8 x i8> %tmp
-}
-
-define <4 x i16> @test_shl_v4i16(<4 x i16> %a) {
-; CHECK: test_shl_v4i16:
-; CHECK: shl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %tmp = shl <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
- ret <4 x i16> %tmp
-}
-
-define <2 x i32> @test_shl_v2i32(<2 x i32> %a) {
-; CHECK: test_shl_v2i32:
-; CHECK: shl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %tmp = shl <2 x i32> %a, <i32 3, i32 3>
- ret <2 x i32> %tmp
-}
-
-define <16 x i8> @test_shl_v16i8(<16 x i8> %a) {
-; CHECK: test_shl_v16i8:
-; CHECK: shl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %tmp = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- ret <16 x i8> %tmp
-}
-
-define <8 x i16> @test_shl_v8i16(<8 x i16> %a) {
-; CHECK: test_shl_v8i16:
-; CHECK: shl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %tmp = shl <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
- ret <8 x i16> %tmp
-}
-
-define <4 x i32> @test_shl_v4i32(<4 x i32> %a) {
-; CHECK: test_shl_v4i32:
-; CHECK: shl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %tmp = shl <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
- ret <4 x i32> %tmp
-}
-
-define <2 x i64> @test_shl_v2i64(<2 x i64> %a) {
-; CHECK: test_shl_v2i64:
-; CHECK: shl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #63
- %tmp = shl <2 x i64> %a, <i64 63, i64 63>
- ret <2 x i64> %tmp
-}
-
diff --git a/test/CodeGen/AArch64/neon-shl-ashr-lshr.ll b/test/CodeGen/AArch64/neon-shl-ashr-lshr.ll
deleted file mode 100644
index 0b520d7..0000000
--- a/test/CodeGen/AArch64/neon-shl-ashr-lshr.ll
+++ /dev/null
@@ -1,333 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define <8 x i8> @shl.v8i8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK-LABEL: shl.v8i8:
-; CHECK: ushl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %c = shl <8 x i8> %a, %b
- ret <8 x i8> %c
-}
-
-define <4 x i16> @shl.v4i16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK-LABEL: shl.v4i16:
-; CHECK: ushl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %c = shl <4 x i16> %a, %b
- ret <4 x i16> %c
-}
-
-define <2 x i32> @shl.v2i32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK-LABEL: shl.v2i32:
-; CHECK: ushl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %c = shl <2 x i32> %a, %b
- ret <2 x i32> %c
-}
-
-define <1 x i64> @shl.v1i64(<1 x i64> %a, <1 x i64> %b) {
-; CHECK-LABEL: shl.v1i64:
-; CHECK: ushl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
- %c = shl <1 x i64> %a, %b
- ret <1 x i64> %c
-}
-
-define <16 x i8> @shl.v16i8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: shl.v16i8:
-; CHECK: ushl v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %c = shl <16 x i8> %a, %b
- ret <16 x i8> %c
-}
-
-define <8 x i16> @shl.v8i16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: shl.v8i16:
-; CHECK: ushl v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %c = shl <8 x i16> %a, %b
- ret <8 x i16> %c
-}
-
-define <4 x i32> @shl.v4i32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: shl.v4i32:
-; CHECK: ushl v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %c = shl <4 x i32> %a, %b
- ret <4 x i32> %c
-}
-
-define <2 x i64> @shl.v2i64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: shl.v2i64:
-; CHECK: ushl v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %c = shl <2 x i64> %a, %b
- ret <2 x i64> %c
-}
-
-define <8 x i8> @lshr.v8i8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK-LABEL: lshr.v8i8:
-; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
-; CHECK: ushl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %c = lshr <8 x i8> %a, %b
- ret <8 x i8> %c
-}
-
-define <4 x i16> @lshr.v4i16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK-LABEL: lshr.v4i16:
-; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
-; CHECK: ushl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %c = lshr <4 x i16> %a, %b
- ret <4 x i16> %c
-}
-
-define <2 x i32> @lshr.v2i32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK-LABEL: lshr.v2i32:
-; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
-; CHECK: ushl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %c = lshr <2 x i32> %a, %b
- ret <2 x i32> %c
-}
-
-define <1 x i64> @lshr.v1i64(<1 x i64> %a, <1 x i64> %b) {
-; CHECK-LABEL: lshr.v1i64:
-; CHECK: neg d{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: ushl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
- %c = lshr <1 x i64> %a, %b
- ret <1 x i64> %c
-}
-
-define <16 x i8> @lshr.v16i8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: lshr.v16i8:
-; CHECK: neg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: ushl v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %c = lshr <16 x i8> %a, %b
- ret <16 x i8> %c
-}
-
-define <8 x i16> @lshr.v8i16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: lshr.v8i16:
-; CHECK: neg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
-; CHECK: ushl v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %c = lshr <8 x i16> %a, %b
- ret <8 x i16> %c
-}
-
-define <4 x i32> @lshr.v4i32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: lshr.v4i32:
-; CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
-; CHECK: ushl v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %c = lshr <4 x i32> %a, %b
- ret <4 x i32> %c
-}
-
-define <2 x i64> @lshr.v2i64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: lshr.v2i64:
-; CHECK: neg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
-; CHECK: ushl v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %c = lshr <2 x i64> %a, %b
- ret <2 x i64> %c
-}
-
-define <8 x i8> @ashr.v8i8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK-LABEL: ashr.v8i8:
-; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
-; CHECK: sshl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %c = ashr <8 x i8> %a, %b
- ret <8 x i8> %c
-}
-
-define <4 x i16> @ashr.v4i16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK-LABEL: ashr.v4i16:
-; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
-; CHECK: sshl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %c = ashr <4 x i16> %a, %b
- ret <4 x i16> %c
-}
-
-define <2 x i32> @ashr.v2i32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK-LABEL: ashr.v2i32:
-; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
-; CHECK: sshl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %c = ashr <2 x i32> %a, %b
- ret <2 x i32> %c
-}
-
-define <1 x i64> @ashr.v1i64(<1 x i64> %a, <1 x i64> %b) {
-; CHECK-LABEL: ashr.v1i64:
-; CHECK: neg d{{[0-9]+}}, d{{[0-9]+}}
-; CHECK: sshl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
- %c = ashr <1 x i64> %a, %b
- ret <1 x i64> %c
-}
-
-define <16 x i8> @ashr.v16i8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: ashr.v16i8:
-; CHECK: neg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: sshl v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
- %c = ashr <16 x i8> %a, %b
- ret <16 x i8> %c
-}
-
-define <8 x i16> @ashr.v8i16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: ashr.v8i16:
-; CHECK: neg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
-; CHECK: sshl v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
- %c = ashr <8 x i16> %a, %b
- ret <8 x i16> %c
-}
-
-define <4 x i32> @ashr.v4i32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: ashr.v4i32:
-; CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
-; CHECK: sshl v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
- %c = ashr <4 x i32> %a, %b
- ret <4 x i32> %c
-}
-
-define <2 x i64> @ashr.v2i64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: ashr.v2i64:
-; CHECK: neg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
-; CHECK: sshl v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
- %c = ashr <2 x i64> %a, %b
- ret <2 x i64> %c
-}
-
-define <1 x i64> @shl.v1i64.0(<1 x i64> %a) {
-; CHECK-LABEL: shl.v1i64.0:
-; CHECK-NOT: shl d{{[0-9]+}}, d{{[0-9]+}}, #0
- %c = shl <1 x i64> %a, zeroinitializer
- ret <1 x i64> %c
-}
-
-define <2 x i32> @shl.v2i32.0(<2 x i32> %a) {
-; CHECK-LABEL: shl.v2i32.0:
-; CHECK-NOT: shl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #0
- %c = shl <2 x i32> %a, zeroinitializer
- ret <2 x i32> %c
-}
-
-; The following test cases test shl/ashr/lshr with v1i8/v1i16/v1i32 types
-
-define <1 x i8> @shl.v1i8(<1 x i8> %a, <1 x i8> %b) {
-; CHECK-LABEL: shl.v1i8:
-; CHECK: ushl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %c = shl <1 x i8> %a, %b
- ret <1 x i8> %c
-}
-
-define <1 x i16> @shl.v1i16(<1 x i16> %a, <1 x i16> %b) {
-; CHECK-LABEL: shl.v1i16:
-; CHECK: ushl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %c = shl <1 x i16> %a, %b
- ret <1 x i16> %c
-}
-
-define <1 x i32> @shl.v1i32(<1 x i32> %a, <1 x i32> %b) {
-; CHECK-LABEL: shl.v1i32:
-; CHECK: ushl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %c = shl <1 x i32> %a, %b
- ret <1 x i32> %c
-}
-
-define <1 x i8> @ashr.v1i8(<1 x i8> %a, <1 x i8> %b) {
-; CHECK-LABEL: ashr.v1i8:
-; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
-; CHECK: sshl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %c = ashr <1 x i8> %a, %b
- ret <1 x i8> %c
-}
-
-define <1 x i16> @ashr.v1i16(<1 x i16> %a, <1 x i16> %b) {
-; CHECK-LABEL: ashr.v1i16:
-; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
-; CHECK: sshl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %c = ashr <1 x i16> %a, %b
- ret <1 x i16> %c
-}
-
-define <1 x i32> @ashr.v1i32(<1 x i32> %a, <1 x i32> %b) {
-; CHECK-LABEL: ashr.v1i32:
-; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
-; CHECK: sshl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %c = ashr <1 x i32> %a, %b
- ret <1 x i32> %c
-}
-
-define <1 x i8> @lshr.v1i8(<1 x i8> %a, <1 x i8> %b) {
-; CHECK-LABEL: lshr.v1i8:
-; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
-; CHECK: ushl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
- %c = lshr <1 x i8> %a, %b
- ret <1 x i8> %c
-}
-
-define <1 x i16> @lshr.v1i16(<1 x i16> %a, <1 x i16> %b) {
-; CHECK-LABEL: lshr.v1i16:
-; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
-; CHECK: ushl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
- %c = lshr <1 x i16> %a, %b
- ret <1 x i16> %c
-}
-
-define <1 x i32> @lshr.v1i32(<1 x i32> %a, <1 x i32> %b) {
-; CHECK-LABEL: lshr.v1i32:
-; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
-; CHECK: ushl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
- %c = lshr <1 x i32> %a, %b
- ret <1 x i32> %c
-}
-
-define <1 x i8> @shl.v1i8.imm(<1 x i8> %a) {
-; CHECK-LABEL: shl.v1i8.imm:
-; CHECK: shl v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, #3
- %c = shl <1 x i8> %a, <i8 3>
- ret <1 x i8> %c
-}
-
-define <1 x i16> @shl.v1i16.imm(<1 x i16> %a) {
-; CHECK-LABEL: shl.v1i16.imm:
-; CHECK: shl v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, #5
- %c = shl <1 x i16> %a, <i16 5>
- ret <1 x i16> %c
-}
-
-define <1 x i32> @shl.v1i32.imm(<1 x i32> %a) {
-; CHECK-LABEL: shl.v1i32.imm:
-; CHECK-NOT: shl v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #0
- %c = shl <1 x i32> %a, zeroinitializer
- ret <1 x i32> %c
-}
-
-define <1 x i8> @ashr.v1i8.imm(<1 x i8> %a) {
-; CHECK-LABEL: ashr.v1i8.imm:
-; CHECK: sshr v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, #3
- %c = ashr <1 x i8> %a, <i8 3>
- ret <1 x i8> %c
-}
-
-define <1 x i16> @ashr.v1i16.imm(<1 x i16> %a) {
-; CHECK-LABEL: ashr.v1i16.imm:
-; CHECK: sshr v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, #10
- %c = ashr <1 x i16> %a, <i16 10>
- ret <1 x i16> %c
-}
-
-define <1 x i32> @ashr.v1i32.imm(<1 x i32> %a) {
-; CHECK-LABEL: ashr.v1i32.imm:
-; CHECK: sshr v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #31
- %c = ashr <1 x i32> %a, <i32 31>
- ret <1 x i32> %c
-}
-
-define <1 x i8> @lshr.v1i8.imm(<1 x i8> %a) {
-; CHECK-LABEL: lshr.v1i8.imm:
-; CHECK: ushr v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, #3
- %c = lshr <1 x i8> %a, <i8 3>
- ret <1 x i8> %c
-}
-
-define <1 x i16> @lshr.v1i16.imm(<1 x i16> %a) {
-; CHECK-LABEL: lshr.v1i16.imm:
-; CHECK: ushr v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, #10
- %c = lshr <1 x i16> %a, <i16 10>
- ret <1 x i16> %c
-}
-
-define <1 x i32> @lshr.v1i32.imm(<1 x i32> %a) {
-; CHECK-LABEL: lshr.v1i32.imm:
-; CHECK: ushr v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #31
- %c = lshr <1 x i32> %a, <i32 31>
- ret <1 x i32> %c
-}
diff --git a/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll b/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll
deleted file mode 100644
index d5557c0..0000000
--- a/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll
+++ /dev/null
@@ -1,2314 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define void @test_ldst1_v16i8(<16 x i8>* %ptr, <16 x i8>* %ptr2) {
-; CHECK-LABEL: test_ldst1_v16i8:
-; CHECK: ld1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
- %tmp = load <16 x i8>* %ptr
- store <16 x i8> %tmp, <16 x i8>* %ptr2
- ret void
-}
-
-define void @test_ldst1_v8i16(<8 x i16>* %ptr, <8 x i16>* %ptr2) {
-; CHECK-LABEL: test_ldst1_v8i16:
-; CHECK: ld1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
- %tmp = load <8 x i16>* %ptr
- store <8 x i16> %tmp, <8 x i16>* %ptr2
- ret void
-}
-
-define void @test_ldst1_v4i32(<4 x i32>* %ptr, <4 x i32>* %ptr2) {
-; CHECK-LABEL: test_ldst1_v4i32:
-; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %tmp = load <4 x i32>* %ptr
- store <4 x i32> %tmp, <4 x i32>* %ptr2
- ret void
-}
-
-define void @test_ldst1_v2i64(<2 x i64>* %ptr, <2 x i64>* %ptr2) {
-; CHECK-LABEL: test_ldst1_v2i64:
-; CHECK: ld1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %tmp = load <2 x i64>* %ptr
- store <2 x i64> %tmp, <2 x i64>* %ptr2
- ret void
-}
-
-define void @test_ldst1_v8i8(<8 x i8>* %ptr, <8 x i8>* %ptr2) {
-; CHECK-LABEL: test_ldst1_v8i8:
-; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
- %tmp = load <8 x i8>* %ptr
- store <8 x i8> %tmp, <8 x i8>* %ptr2
- ret void
-}
-
-define void @test_ldst1_v4i16(<4 x i16>* %ptr, <4 x i16>* %ptr2) {
-; CHECK-LABEL: test_ldst1_v4i16:
-; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
- %tmp = load <4 x i16>* %ptr
- store <4 x i16> %tmp, <4 x i16>* %ptr2
- ret void
-}
-
-define void @test_ldst1_v2i32(<2 x i32>* %ptr, <2 x i32>* %ptr2) {
-; CHECK-LABEL: test_ldst1_v2i32:
-; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %tmp = load <2 x i32>* %ptr
- store <2 x i32> %tmp, <2 x i32>* %ptr2
- ret void
-}
-
-define void @test_ldst1_v1i64(<1 x i64>* %ptr, <1 x i64>* %ptr2) {
-; CHECK-LABEL: test_ldst1_v1i64:
-; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %tmp = load <1 x i64>* %ptr
- store <1 x i64> %tmp, <1 x i64>* %ptr2
- ret void
-}
-
-%struct.int8x16x2_t = type { [2 x <16 x i8>] }
-%struct.int16x8x2_t = type { [2 x <8 x i16>] }
-%struct.int32x4x2_t = type { [2 x <4 x i32>] }
-%struct.int64x2x2_t = type { [2 x <2 x i64>] }
-%struct.float32x4x2_t = type { [2 x <4 x float>] }
-%struct.float64x2x2_t = type { [2 x <2 x double>] }
-%struct.int8x8x2_t = type { [2 x <8 x i8>] }
-%struct.int16x4x2_t = type { [2 x <4 x i16>] }
-%struct.int32x2x2_t = type { [2 x <2 x i32>] }
-%struct.int64x1x2_t = type { [2 x <1 x i64>] }
-%struct.float32x2x2_t = type { [2 x <2 x float>] }
-%struct.float64x1x2_t = type { [2 x <1 x double>] }
-%struct.int8x16x3_t = type { [3 x <16 x i8>] }
-%struct.int16x8x3_t = type { [3 x <8 x i16>] }
-%struct.int32x4x3_t = type { [3 x <4 x i32>] }
-%struct.int64x2x3_t = type { [3 x <2 x i64>] }
-%struct.float32x4x3_t = type { [3 x <4 x float>] }
-%struct.float64x2x3_t = type { [3 x <2 x double>] }
-%struct.int8x8x3_t = type { [3 x <8 x i8>] }
-%struct.int16x4x3_t = type { [3 x <4 x i16>] }
-%struct.int32x2x3_t = type { [3 x <2 x i32>] }
-%struct.int64x1x3_t = type { [3 x <1 x i64>] }
-%struct.float32x2x3_t = type { [3 x <2 x float>] }
-%struct.float64x1x3_t = type { [3 x <1 x double>] }
-%struct.int8x16x4_t = type { [4 x <16 x i8>] }
-%struct.int16x8x4_t = type { [4 x <8 x i16>] }
-%struct.int32x4x4_t = type { [4 x <4 x i32>] }
-%struct.int64x2x4_t = type { [4 x <2 x i64>] }
-%struct.float32x4x4_t = type { [4 x <4 x float>] }
-%struct.float64x2x4_t = type { [4 x <2 x double>] }
-%struct.int8x8x4_t = type { [4 x <8 x i8>] }
-%struct.int16x4x4_t = type { [4 x <4 x i16>] }
-%struct.int32x2x4_t = type { [4 x <2 x i32>] }
-%struct.int64x1x4_t = type { [4 x <1 x i64>] }
-%struct.float32x2x4_t = type { [4 x <2 x float>] }
-%struct.float64x1x4_t = type { [4 x <1 x double>] }
-
-
-define <16 x i8> @test_vld1q_s8(i8* readonly %a) {
-; CHECK-LABEL: test_vld1q_s8
-; CHECK: ld1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
- %vld1 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %a, i32 1)
- ret <16 x i8> %vld1
-}
-
-define <8 x i16> @test_vld1q_s16(i16* readonly %a) {
-; CHECK-LABEL: test_vld1q_s16
-; CHECK: ld1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld1 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %1, i32 2)
- ret <8 x i16> %vld1
-}
-
-define <4 x i32> @test_vld1q_s32(i32* readonly %a) {
-; CHECK-LABEL: test_vld1q_s32
-; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld1 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %1, i32 4)
- ret <4 x i32> %vld1
-}
-
-define <2 x i64> @test_vld1q_s64(i64* readonly %a) {
-; CHECK-LABEL: test_vld1q_s64
-; CHECK: ld1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld1 = tail call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %1, i32 8)
- ret <2 x i64> %vld1
-}
-
-define <4 x float> @test_vld1q_f32(float* readonly %a) {
-; CHECK-LABEL: test_vld1q_f32
-; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %1, i32 4)
- ret <4 x float> %vld1
-}
-
-define <2 x double> @test_vld1q_f64(double* readonly %a) {
-; CHECK-LABEL: test_vld1q_f64
-; CHECK: ld1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld1 = tail call <2 x double> @llvm.arm.neon.vld1.v2f64(i8* %1, i32 8)
- ret <2 x double> %vld1
-}
-
-define <8 x i8> @test_vld1_s8(i8* readonly %a) {
-; CHECK-LABEL: test_vld1_s8
-; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
- %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1)
- ret <8 x i8> %vld1
-}
-
-define <4 x i16> @test_vld1_s16(i16* readonly %a) {
-; CHECK-LABEL: test_vld1_s16
-; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2)
- ret <4 x i16> %vld1
-}
-
-define <2 x i32> @test_vld1_s32(i32* readonly %a) {
-; CHECK-LABEL: test_vld1_s32
-; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld1 = tail call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %1, i32 4)
- ret <2 x i32> %vld1
-}
-
-define <1 x i64> @test_vld1_s64(i64* readonly %a) {
-; CHECK-LABEL: test_vld1_s64
-; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld1 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %1, i32 8)
- ret <1 x i64> %vld1
-}
-
-define <2 x float> @test_vld1_f32(float* readonly %a) {
-; CHECK-LABEL: test_vld1_f32
-; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld1 = tail call <2 x float> @llvm.arm.neon.vld1.v2f32(i8* %1, i32 4)
- ret <2 x float> %vld1
-}
-
-define <1 x double> @test_vld1_f64(double* readonly %a) {
-; CHECK-LABEL: test_vld1_f64
-; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld1 = tail call <1 x double> @llvm.arm.neon.vld1.v1f64(i8* %1, i32 8)
- ret <1 x double> %vld1
-}
-
-define <8 x i8> @test_vld1_p8(i8* readonly %a) {
-; CHECK-LABEL: test_vld1_p8
-; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
- %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1)
- ret <8 x i8> %vld1
-}
-
-define <4 x i16> @test_vld1_p16(i16* readonly %a) {
-; CHECK-LABEL: test_vld1_p16
-; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2)
- ret <4 x i16> %vld1
-}
-
-define %struct.int8x16x2_t @test_vld2q_s8(i8* readonly %a) {
-; CHECK-LABEL: test_vld2q_s8
-; CHECK: ld2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
- %vld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %a, i32 1)
- %vld2.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2.fca.1.extract, 0, 1
- ret %struct.int8x16x2_t %.fca.0.1.insert
-}
-
-define %struct.int16x8x2_t @test_vld2q_s16(i16* readonly %a) {
-; CHECK-LABEL: test_vld2q_s16
-; CHECK: ld2 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8* %1, i32 2)
- %vld2.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2.fca.1.extract, 0, 1
- ret %struct.int16x8x2_t %.fca.0.1.insert
-}
-
-define %struct.int32x4x2_t @test_vld2q_s32(i32* readonly %a) {
-; CHECK-LABEL: test_vld2q_s32
-; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8* %1, i32 4)
- %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2.fca.1.extract, 0, 1
- ret %struct.int32x4x2_t %.fca.0.1.insert
-}
-
-define %struct.int64x2x2_t @test_vld2q_s64(i64* readonly %a) {
-; CHECK-LABEL: test_vld2q_s64
-; CHECK: ld2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8* %1, i32 8)
- %vld2.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2.fca.1.extract, 0, 1
- ret %struct.int64x2x2_t %.fca.0.1.insert
-}
-
-define %struct.float32x4x2_t @test_vld2q_f32(float* readonly %a) {
-; CHECK-LABEL: test_vld2q_f32
-; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %1, i32 4)
- %vld2.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2.fca.1.extract, 0, 1
- ret %struct.float32x4x2_t %.fca.0.1.insert
-}
-
-define %struct.float64x2x2_t @test_vld2q_f64(double* readonly %a) {
-; CHECK-LABEL: test_vld2q_f64
-; CHECK: ld2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld2 = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8* %1, i32 8)
- %vld2.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2.fca.1.extract, 0, 1
- ret %struct.float64x2x2_t %.fca.0.1.insert
-}
-
-define %struct.int8x8x2_t @test_vld2_s8(i8* readonly %a) {
-; CHECK-LABEL: test_vld2_s8
-; CHECK: ld2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
- %vld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8* %a, i32 1)
- %vld2.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2.fca.1.extract, 0, 1
- ret %struct.int8x8x2_t %.fca.0.1.insert
-}
-
-define %struct.int16x4x2_t @test_vld2_s16(i16* readonly %a) {
-; CHECK-LABEL: test_vld2_s16
-; CHECK: ld2 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8* %1, i32 2)
- %vld2.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2.fca.1.extract, 0, 1
- ret %struct.int16x4x2_t %.fca.0.1.insert
-}
-
-define %struct.int32x2x2_t @test_vld2_s32(i32* readonly %a) {
-; CHECK-LABEL: test_vld2_s32
-; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8* %1, i32 4)
- %vld2.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2.fca.1.extract, 0, 1
- ret %struct.int32x2x2_t %.fca.0.1.insert
-}
-
-define %struct.int64x1x2_t @test_vld2_s64(i64* readonly %a) {
-; CHECK-LABEL: test_vld2_s64
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %1, i32 8)
- %vld2.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2.fca.1.extract, 0, 1
- ret %struct.int64x1x2_t %.fca.0.1.insert
-}
-
-define %struct.float32x2x2_t @test_vld2_f32(float* readonly %a) {
-; CHECK-LABEL: test_vld2_f32
-; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld2 = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8* %1, i32 4)
- %vld2.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2.fca.1.extract, 0, 1
- ret %struct.float32x2x2_t %.fca.0.1.insert
-}
-
-define %struct.float64x1x2_t @test_vld2_f64(double* readonly %a) {
-; CHECK-LABEL: test_vld2_f64
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld2 = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %1, i32 8)
- %vld2.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2.fca.1.extract, 0, 1
- ret %struct.float64x1x2_t %.fca.0.1.insert
-}
-
-define %struct.int8x16x3_t @test_vld3q_s8(i8* readonly %a) {
-; CHECK-LABEL: test_vld3q_s8
-; CHECK: ld3 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
- %vld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %a, i32 1)
- %vld3.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3.fca.2.extract, 0, 2
- ret %struct.int8x16x3_t %.fca.0.2.insert
-}
-
-define %struct.int16x8x3_t @test_vld3q_s16(i16* readonly %a) {
-; CHECK-LABEL: test_vld3q_s16
-; CHECK: ld3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8* %1, i32 2)
- %vld3.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3.fca.2.extract, 0, 2
- ret %struct.int16x8x3_t %.fca.0.2.insert
-}
-
-define %struct.int32x4x3_t @test_vld3q_s32(i32* readonly %a) {
-; CHECK-LABEL: test_vld3q_s32
-; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8* %1, i32 4)
- %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3.fca.2.extract, 0, 2
- ret %struct.int32x4x3_t %.fca.0.2.insert
-}
-
-define %struct.int64x2x3_t @test_vld3q_s64(i64* readonly %a) {
-; CHECK-LABEL: test_vld3q_s64
-; CHECK: ld3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8* %1, i32 8)
- %vld3.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3.fca.2.extract, 0, 2
- ret %struct.int64x2x3_t %.fca.0.2.insert
-}
-
-define %struct.float32x4x3_t @test_vld3q_f32(float* readonly %a) {
-; CHECK-LABEL: test_vld3q_f32
-; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8* %1, i32 4)
- %vld3.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3.fca.2.extract, 0, 2
- ret %struct.float32x4x3_t %.fca.0.2.insert
-}
-
-define %struct.float64x2x3_t @test_vld3q_f64(double* readonly %a) {
-; CHECK-LABEL: test_vld3q_f64
-; CHECK: ld3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8* %1, i32 8)
- %vld3.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3.fca.2.extract, 0, 2
- ret %struct.float64x2x3_t %.fca.0.2.insert
-}
-
-define %struct.int8x8x3_t @test_vld3_s8(i8* readonly %a) {
-; CHECK-LABEL: test_vld3_s8
-; CHECK: ld3 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
- %vld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8* %a, i32 1)
- %vld3.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3.fca.2.extract, 0, 2
- ret %struct.int8x8x3_t %.fca.0.2.insert
-}
-
-define %struct.int16x4x3_t @test_vld3_s16(i16* readonly %a) {
-; CHECK-LABEL: test_vld3_s16
-; CHECK: ld3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %1, i32 2)
- %vld3.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3.fca.2.extract, 0, 2
- ret %struct.int16x4x3_t %.fca.0.2.insert
-}
-
-define %struct.int32x2x3_t @test_vld3_s32(i32* readonly %a) {
-; CHECK-LABEL: test_vld3_s32
-; CHECK: ld3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8* %1, i32 4)
- %vld3.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3.fca.2.extract, 0, 2
- ret %struct.int32x2x3_t %.fca.0.2.insert
-}
-
-define %struct.int64x1x3_t @test_vld3_s64(i64* readonly %a) {
-; CHECK-LABEL: test_vld3_s64
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %1, i32 8)
- %vld3.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3.fca.2.extract, 0, 2
- ret %struct.int64x1x3_t %.fca.0.2.insert
-}
-
-define %struct.float32x2x3_t @test_vld3_f32(float* readonly %a) {
-; CHECK-LABEL: test_vld3_f32
-; CHECK: ld3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8* %1, i32 4)
- %vld3.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3.fca.2.extract, 0, 2
- ret %struct.float32x2x3_t %.fca.0.2.insert
-}
-
-define %struct.float64x1x3_t @test_vld3_f64(double* readonly %a) {
-; CHECK-LABEL: test_vld3_f64
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %1, i32 8)
- %vld3.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3.fca.2.extract, 0, 2
- ret %struct.float64x1x3_t %.fca.0.2.insert
-}
-
-define %struct.int8x16x4_t @test_vld4q_s8(i8* readonly %a) {
-; CHECK-LABEL: test_vld4q_s8
-; CHECK: ld4 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
- %vld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %a, i32 1)
- %vld4.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld4.fca.3.extract, 0, 3
- ret %struct.int8x16x4_t %.fca.0.3.insert
-}
-
-define %struct.int16x8x4_t @test_vld4q_s16(i16* readonly %a) {
-; CHECK-LABEL: test_vld4q_s16
-; CHECK: ld4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8* %1, i32 2)
- %vld4.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld4.fca.3.extract, 0, 3
- ret %struct.int16x8x4_t %.fca.0.3.insert
-}
-
-define %struct.int32x4x4_t @test_vld4q_s32(i32* readonly %a) {
-; CHECK-LABEL: test_vld4q_s32
-; CHECK: ld4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8* %1, i32 4)
- %vld4.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld4.fca.3.extract, 0, 3
- ret %struct.int32x4x4_t %.fca.0.3.insert
-}
-
-define %struct.int64x2x4_t @test_vld4q_s64(i64* readonly %a) {
-; CHECK-LABEL: test_vld4q_s64
-; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8* %1, i32 8)
- %vld4.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld4.fca.3.extract, 0, 3
- ret %struct.int64x2x4_t %.fca.0.3.insert
-}
-
-define %struct.float32x4x4_t @test_vld4q_f32(float* readonly %a) {
-; CHECK-LABEL: test_vld4q_f32
-; CHECK: ld4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8* %1, i32 4)
- %vld4.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld4.fca.3.extract, 0, 3
- ret %struct.float32x4x4_t %.fca.0.3.insert
-}
-
-define %struct.float64x2x4_t @test_vld4q_f64(double* readonly %a) {
-; CHECK-LABEL: test_vld4q_f64
-; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8* %1, i32 8)
- %vld4.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld4.fca.3.extract, 0, 3
- ret %struct.float64x2x4_t %.fca.0.3.insert
-}
-
-define %struct.int8x8x4_t @test_vld4_s8(i8* readonly %a) {
-; CHECK-LABEL: test_vld4_s8
-; CHECK: ld4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
- %vld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %a, i32 1)
- %vld4.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld4.fca.3.extract, 0, 3
- ret %struct.int8x8x4_t %.fca.0.3.insert
-}
-
-define %struct.int16x4x4_t @test_vld4_s16(i16* readonly %a) {
-; CHECK-LABEL: test_vld4_s16
-; CHECK: ld4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8* %1, i32 2)
- %vld4.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld4.fca.3.extract, 0, 3
- ret %struct.int16x4x4_t %.fca.0.3.insert
-}
-
-define %struct.int32x2x4_t @test_vld4_s32(i32* readonly %a) {
-; CHECK-LABEL: test_vld4_s32
-; CHECK: ld4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8* %1, i32 4)
- %vld4.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld4.fca.3.extract, 0, 3
- ret %struct.int32x2x4_t %.fca.0.3.insert
-}
-
-define %struct.int64x1x4_t @test_vld4_s64(i64* readonly %a) {
-; CHECK-LABEL: test_vld4_s64
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %1, i32 8)
- %vld4.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld4.fca.3.extract, 0, 3
- ret %struct.int64x1x4_t %.fca.0.3.insert
-}
-
-define %struct.float32x2x4_t @test_vld4_f32(float* readonly %a) {
-; CHECK-LABEL: test_vld4_f32
-; CHECK: ld4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32(i8* %1, i32 4)
- %vld4.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld4.fca.3.extract, 0, 3
- ret %struct.float32x2x4_t %.fca.0.3.insert
-}
-
-define %struct.float64x1x4_t @test_vld4_f64(double* readonly %a) {
-; CHECK-LABEL: test_vld4_f64
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %1, i32 8)
- %vld4.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld4.fca.3.extract, 0, 3
- ret %struct.float64x1x4_t %.fca.0.3.insert
-}
-
-declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32)
-declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32)
-declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*, i32)
-declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32)
-declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32)
-declare <2 x double> @llvm.arm.neon.vld1.v2f64(i8*, i32)
-declare <8 x i8> @llvm.arm.neon.vld1.v8i8(i8*, i32)
-declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32)
-declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32)
-declare <1 x i64> @llvm.arm.neon.vld1.v1i64(i8*, i32)
-declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32)
-declare <1 x double> @llvm.arm.neon.vld1.v1f64(i8*, i32)
-declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32)
-declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8*, i32)
-declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8*, i32)
-declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8*, i32)
-declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8*, i32)
-declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8*, i32)
-declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8*, i32)
-declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8*, i32)
-declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8*, i32)
-declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32)
-declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8*, i32)
-declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32)
-declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32)
-declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8*, i32)
-declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8*, i32)
-declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8*, i32)
-declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8*, i32)
-declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8*, i32)
-declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8*, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32)
-declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8*, i32)
-declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32)
-declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8*, i32)
-declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32)
-declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32)
-declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8*, i32)
-declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8*, i32)
-declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8*, i32)
-declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8*, i32)
-declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8*, i32)
-declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8*, i32)
-declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8*, i32)
-declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32)
-declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32(i8*, i32)
-declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32)
-
-define void @test_vst1q_s8(i8* %a, <16 x i8> %b) {
-; CHECK-LABEL: test_vst1q_s8
-; CHECK: st1 {v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- tail call void @llvm.arm.neon.vst1.v16i8(i8* %a, <16 x i8> %b, i32 1)
- ret void
-}
-
-define void @test_vst1q_s16(i16* %a, <8 x i16> %b) {
-; CHECK-LABEL: test_vst1q_s16
-; CHECK: st1 {v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst1.v8i16(i8* %1, <8 x i16> %b, i32 2)
- ret void
-}
-
-define void @test_vst1q_s32(i32* %a, <4 x i32> %b) {
-; CHECK-LABEL: test_vst1q_s32
-; CHECK: st1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst1.v4i32(i8* %1, <4 x i32> %b, i32 4)
- ret void
-}
-
-define void @test_vst1q_s64(i64* %a, <2 x i64> %b) {
-; CHECK-LABEL: test_vst1q_s64
-; CHECK: st1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst1.v2i64(i8* %1, <2 x i64> %b, i32 8)
- ret void
-}
-
-define void @test_vst1q_f32(float* %a, <4 x float> %b) {
-; CHECK-LABEL: test_vst1q_f32
-; CHECK: st1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst1.v4f32(i8* %1, <4 x float> %b, i32 4)
- ret void
-}
-
-define void @test_vst1q_f64(double* %a, <2 x double> %b) {
-; CHECK-LABEL: test_vst1q_f64
-; CHECK: st1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst1.v2f64(i8* %1, <2 x double> %b, i32 8)
- ret void
-}
-
-define void @test_vst1_s8(i8* %a, <8 x i8> %b) {
-; CHECK-LABEL: test_vst1_s8
-; CHECK: st1 {v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- tail call void @llvm.arm.neon.vst1.v8i8(i8* %a, <8 x i8> %b, i32 1)
- ret void
-}
-
-define void @test_vst1_s16(i16* %a, <4 x i16> %b) {
-; CHECK-LABEL: test_vst1_s16
-; CHECK: st1 {v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst1.v4i16(i8* %1, <4 x i16> %b, i32 2)
- ret void
-}
-
-define void @test_vst1_s32(i32* %a, <2 x i32> %b) {
-; CHECK-LABEL: test_vst1_s32
-; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst1.v2i32(i8* %1, <2 x i32> %b, i32 4)
- ret void
-}
-
-define void @test_vst1_s64(i64* %a, <1 x i64> %b) {
-; CHECK-LABEL: test_vst1_s64
-; CHECK: st1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst1.v1i64(i8* %1, <1 x i64> %b, i32 8)
- ret void
-}
-
-define void @test_vst1_f32(float* %a, <2 x float> %b) {
-; CHECK-LABEL: test_vst1_f32
-; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst1.v2f32(i8* %1, <2 x float> %b, i32 4)
- ret void
-}
-
-define void @test_vst1_f64(double* %a, <1 x double> %b) {
-; CHECK-LABEL: test_vst1_f64
-; CHECK: st1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst1.v1f64(i8* %1, <1 x double> %b, i32 8)
- ret void
-}
-
-define void @test_vst2q_s8(i8* %a, [2 x <16 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_s8
-; CHECK: st2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1
- tail call void @llvm.arm.neon.vst2.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 1)
- ret void
-}
-
-define void @test_vst2q_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_s16
-; CHECK: st2 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst2.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 2)
- ret void
-}
-
-define void @test_vst2q_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_s32
-; CHECK: st2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst2.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 4)
- ret void
-}
-
-define void @test_vst2q_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_s64
-; CHECK: st2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst2.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 8)
- ret void
-}
-
-define void @test_vst2q_f32(float* %a, [2 x <4 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_f32
-; CHECK: st2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 4)
- ret void
-}
-
-define void @test_vst2q_f64(double* %a, [2 x <2 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_f64
-; CHECK: st2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst2.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 8)
- ret void
-}
-
-define void @test_vst2_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst2_s8
-; CHECK: st2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
- tail call void @llvm.arm.neon.vst2.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 1)
- ret void
-}
-
-define void @test_vst2_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst2_s16
-; CHECK: st2 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst2.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 2)
- ret void
-}
-
-define void @test_vst2_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst2_s32
-; CHECK: st2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst2.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 4)
- ret void
-}
-
-define void @test_vst2_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst2_s64
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst2.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 8)
- ret void
-}
-
-define void @test_vst2_f32(float* %a, [2 x <2 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst2_f32
-; CHECK: st2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst2.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 4)
- ret void
-}
-
-define void @test_vst2_f64(double* %a, [2 x <1 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst2_f64
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst2.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 8)
- ret void
-}
-
-define void @test_vst3q_s8(i8* %a, [3 x <16 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_s8
-; CHECK: st3 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2
- tail call void @llvm.arm.neon.vst3.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 1)
- ret void
-}
-
-define void @test_vst3q_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_s16
-; CHECK: st3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst3.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 2)
- ret void
-}
-
-define void @test_vst3q_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_s32
-; CHECK: st3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst3.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 4)
- ret void
-}
-
-define void @test_vst3q_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_s64
-; CHECK: st3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst3.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 8)
- ret void
-}
-
-define void @test_vst3q_f32(float* %a, [3 x <4 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_f32
-; CHECK: st3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst3.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 4)
- ret void
-}
-
-define void @test_vst3q_f64(double* %a, [3 x <2 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_f64
-; CHECK: st3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst3.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 8)
- ret void
-}
-
-define void @test_vst3_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst3_s8
-; CHECK: st3 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
- tail call void @llvm.arm.neon.vst3.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 1)
- ret void
-}
-
-define void @test_vst3_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst3_s16
-; CHECK: st3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst3.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 2)
- ret void
-}
-
-define void @test_vst3_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst3_s32
-; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst3.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 4)
- ret void
-}
-
-define void @test_vst3_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst3_s64
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst3.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 8)
- ret void
-}
-
-define void @test_vst3_f32(float* %a, [3 x <2 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst3_f32
-; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst3.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 4)
- ret void
-}
-
-define void @test_vst3_f64(double* %a, [3 x <1 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst3_f64
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst3.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 8)
- ret void
-}
-
-define void @test_vst4q_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_s8
-; CHECK: st4 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
- tail call void @llvm.arm.neon.vst4.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 1)
- ret void
-}
-
-define void @test_vst4q_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_s16
-; CHECK: st4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst4.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 2)
- ret void
-}
-
-define void @test_vst4q_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_s32
-; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst4.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 4)
- ret void
-}
-
-define void @test_vst4q_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_s64
-; CHECK: st4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst4.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 8)
- ret void
-}
-
-define void @test_vst4q_f32(float* %a, [4 x <4 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_f32
-; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst4.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 4)
- ret void
-}
-
-define void @test_vst4q_f64(double* %a, [4 x <2 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_f64
-; CHECK: st4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst4.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 8)
- ret void
-}
-
-define void @test_vst4_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst4_s8
-; CHECK: st4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
- tail call void @llvm.arm.neon.vst4.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 1)
- ret void
-}
-
-define void @test_vst4_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst4_s16
-; CHECK: st4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst4.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 2)
- ret void
-}
-
-define void @test_vst4_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst4_s32
-; CHECK: st4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst4.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 4)
- ret void
-}
-
-define void @test_vst4_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst4_s64
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst4.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 8)
- ret void
-}
-
-define void @test_vst4_f32(float* %a, [4 x <2 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst4_f32
-; CHECK: st4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst4.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 4)
- ret void
-}
-
-define void @test_vst4_f64(double* %a, [4 x <1 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst4_f64
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst4.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 8)
- ret void
-}
-
-declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32)
-declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32)
-declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>, i32)
-declare void @llvm.arm.neon.vst1.v2i64(i8*, <2 x i64>, i32)
-declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32)
-declare void @llvm.arm.neon.vst1.v2f64(i8*, <2 x double>, i32)
-declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32)
-declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>, i32)
-declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32)
-declare void @llvm.arm.neon.vst1.v1i64(i8*, <1 x i64>, i32)
-declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32)
-declare void @llvm.arm.neon.vst1.v1f64(i8*, <1 x double>, i32)
-declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
-declare void @llvm.arm.neon.vst2.v8i16(i8*, <8 x i16>, <8 x i16>, i32)
-declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>, i32)
-declare void @llvm.arm.neon.vst2.v2i64(i8*, <2 x i64>, <2 x i64>, i32)
-declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32)
-declare void @llvm.arm.neon.vst2.v2f64(i8*, <2 x double>, <2 x double>, i32)
-declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
-declare void @llvm.arm.neon.vst2.v4i16(i8*, <4 x i16>, <4 x i16>, i32)
-declare void @llvm.arm.neon.vst2.v2i32(i8*, <2 x i32>, <2 x i32>, i32)
-declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>, i32)
-declare void @llvm.arm.neon.vst2.v2f32(i8*, <2 x float>, <2 x float>, i32)
-declare void @llvm.arm.neon.vst2.v1f64(i8*, <1 x double>, <1 x double>, i32)
-declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32)
-declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32)
-declare void @llvm.arm.neon.vst3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32)
-declare void @llvm.arm.neon.vst3.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32)
-declare void @llvm.arm.neon.vst3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32)
-declare void @llvm.arm.neon.vst3.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32)
-declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
-declare void @llvm.arm.neon.vst3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32)
-declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
-declare void @llvm.arm.neon.vst3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32)
-declare void @llvm.arm.neon.vst3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32)
-declare void @llvm.arm.neon.vst3.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32)
-declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32)
-declare void @llvm.arm.neon.vst4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32)
-declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32)
-declare void @llvm.arm.neon.vst4.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32)
-declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32)
-declare void @llvm.arm.neon.vst4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32)
-declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
-declare void @llvm.arm.neon.vst4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32)
-declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32)
-declare void @llvm.arm.neon.vst4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32)
-declare void @llvm.arm.neon.vst4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32)
-declare void @llvm.arm.neon.vst4.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32)
-
-define %struct.int8x16x2_t @test_vld1q_s8_x2(i8* %a) {
-; CHECK-LABEL: test_vld1q_s8_x2
-; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- %1 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8* %a, i32 1)
- %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0
- %3 = extractvalue { <16 x i8>, <16 x i8> } %1, 1
- %4 = insertvalue %struct.int8x16x2_t undef, <16 x i8> %2, 0, 0
- %5 = insertvalue %struct.int8x16x2_t %4, <16 x i8> %3, 0, 1
- ret %struct.int8x16x2_t %5
-}
-
-define %struct.int16x8x2_t @test_vld1q_s16_x2(i16* %a) {
-; CHECK-LABEL: test_vld1q_s16_x2
-; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8* %1, i32 2)
- %3 = extractvalue { <8 x i16>, <8 x i16> } %2, 0
- %4 = extractvalue { <8 x i16>, <8 x i16> } %2, 1
- %5 = insertvalue %struct.int16x8x2_t undef, <8 x i16> %3, 0, 0
- %6 = insertvalue %struct.int16x8x2_t %5, <8 x i16> %4, 0, 1
- ret %struct.int16x8x2_t %6
-}
-
-define %struct.int32x4x2_t @test_vld1q_s32_x2(i32* %a) {
-; CHECK-LABEL: test_vld1q_s32_x2
-; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x2.v4i32(i8* %1, i32 4)
- %3 = extractvalue { <4 x i32>, <4 x i32> } %2, 0
- %4 = extractvalue { <4 x i32>, <4 x i32> } %2, 1
- %5 = insertvalue %struct.int32x4x2_t undef, <4 x i32> %3, 0, 0
- %6 = insertvalue %struct.int32x4x2_t %5, <4 x i32> %4, 0, 1
- ret %struct.int32x4x2_t %6
-}
-
-define %struct.int64x2x2_t @test_vld1q_s64_x2(i64* %a) {
-; CHECK-LABEL: test_vld1q_s64_x2
-; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x2.v2i64(i8* %1, i32 8)
- %3 = extractvalue { <2 x i64>, <2 x i64> } %2, 0
- %4 = extractvalue { <2 x i64>, <2 x i64> } %2, 1
- %5 = insertvalue %struct.int64x2x2_t undef, <2 x i64> %3, 0, 0
- %6 = insertvalue %struct.int64x2x2_t %5, <2 x i64> %4, 0, 1
- ret %struct.int64x2x2_t %6
-}
-
-define %struct.float32x4x2_t @test_vld1q_f32_x2(float* %a) {
-; CHECK-LABEL: test_vld1q_f32_x2
-; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x2.v4f32(i8* %1, i32 4)
- %3 = extractvalue { <4 x float>, <4 x float> } %2, 0
- %4 = extractvalue { <4 x float>, <4 x float> } %2, 1
- %5 = insertvalue %struct.float32x4x2_t undef, <4 x float> %3, 0, 0
- %6 = insertvalue %struct.float32x4x2_t %5, <4 x float> %4, 0, 1
- ret %struct.float32x4x2_t %6
-}
-
-
-define %struct.float64x2x2_t @test_vld1q_f64_x2(double* %a) {
-; CHECK-LABEL: test_vld1q_f64_x2
-; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x2.v2f64(i8* %1, i32 8)
- %3 = extractvalue { <2 x double>, <2 x double> } %2, 0
- %4 = extractvalue { <2 x double>, <2 x double> } %2, 1
- %5 = insertvalue %struct.float64x2x2_t undef, <2 x double> %3, 0, 0
- %6 = insertvalue %struct.float64x2x2_t %5, <2 x double> %4, 0, 1
- ret %struct.float64x2x2_t %6
-}
-
-define %struct.int8x8x2_t @test_vld1_s8_x2(i8* %a) {
-; CHECK-LABEL: test_vld1_s8_x2
-; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- %1 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x2.v8i8(i8* %a, i32 1)
- %2 = extractvalue { <8 x i8>, <8 x i8> } %1, 0
- %3 = extractvalue { <8 x i8>, <8 x i8> } %1, 1
- %4 = insertvalue %struct.int8x8x2_t undef, <8 x i8> %2, 0, 0
- %5 = insertvalue %struct.int8x8x2_t %4, <8 x i8> %3, 0, 1
- ret %struct.int8x8x2_t %5
-}
-
-define %struct.int16x4x2_t @test_vld1_s16_x2(i16* %a) {
-; CHECK-LABEL: test_vld1_s16_x2
-; CHECK: ld1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x2.v4i16(i8* %1, i32 2)
- %3 = extractvalue { <4 x i16>, <4 x i16> } %2, 0
- %4 = extractvalue { <4 x i16>, <4 x i16> } %2, 1
- %5 = insertvalue %struct.int16x4x2_t undef, <4 x i16> %3, 0, 0
- %6 = insertvalue %struct.int16x4x2_t %5, <4 x i16> %4, 0, 1
- ret %struct.int16x4x2_t %6
-}
-
-define %struct.int32x2x2_t @test_vld1_s32_x2(i32* %a) {
-; CHECK-LABEL: test_vld1_s32_x2
-; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x2.v2i32(i8* %1, i32 4)
- %3 = extractvalue { <2 x i32>, <2 x i32> } %2, 0
- %4 = extractvalue { <2 x i32>, <2 x i32> } %2, 1
- %5 = insertvalue %struct.int32x2x2_t undef, <2 x i32> %3, 0, 0
- %6 = insertvalue %struct.int32x2x2_t %5, <2 x i32> %4, 0, 1
- ret %struct.int32x2x2_t %6
-}
-
-define %struct.int64x1x2_t @test_vld1_s64_x2(i64* %a) {
-; CHECK-LABEL: test_vld1_s64_x2
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x2.v1i64(i8* %1, i32 8)
- %3 = extractvalue { <1 x i64>, <1 x i64> } %2, 0
- %4 = extractvalue { <1 x i64>, <1 x i64> } %2, 1
- %5 = insertvalue %struct.int64x1x2_t undef, <1 x i64> %3, 0, 0
- %6 = insertvalue %struct.int64x1x2_t %5, <1 x i64> %4, 0, 1
- ret %struct.int64x1x2_t %6
-}
-
-define %struct.float32x2x2_t @test_vld1_f32_x2(float* %a) {
-; CHECK-LABEL: test_vld1_f32_x2
-; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x2.v2f32(i8* %1, i32 4)
- %3 = extractvalue { <2 x float>, <2 x float> } %2, 0
- %4 = extractvalue { <2 x float>, <2 x float> } %2, 1
- %5 = insertvalue %struct.float32x2x2_t undef, <2 x float> %3, 0, 0
- %6 = insertvalue %struct.float32x2x2_t %5, <2 x float> %4, 0, 1
- ret %struct.float32x2x2_t %6
-}
-
-define %struct.float64x1x2_t @test_vld1_f64_x2(double* %a) {
-; CHECK-LABEL: test_vld1_f64_x2
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x2.v1f64(i8* %1, i32 8)
- %3 = extractvalue { <1 x double>, <1 x double> } %2, 0
- %4 = extractvalue { <1 x double>, <1 x double> } %2, 1
- %5 = insertvalue %struct.float64x1x2_t undef, <1 x double> %3, 0, 0
- %6 = insertvalue %struct.float64x1x2_t %5, <1 x double> %4, 0, 1
- ret %struct.float64x1x2_t %6
-}
-
-define %struct.int8x16x3_t @test_vld1q_s8_x3(i8* %a) {
-; CHECK-LABEL: test_vld1q_s8_x3
-; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x3.v16i8(i8* %a, i32 1)
- %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %1, 0
- %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %1, 1
- %4 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %1, 2
- %5 = insertvalue %struct.int8x16x3_t undef, <16 x i8> %2, 0, 0
- %6 = insertvalue %struct.int8x16x3_t %5, <16 x i8> %3, 0, 1
- %7 = insertvalue %struct.int8x16x3_t %6, <16 x i8> %4, 0, 2
- ret %struct.int8x16x3_t %7
-}
-
-define %struct.int16x8x3_t @test_vld1q_s16_x3(i16* %a) {
-; CHECK-LABEL: test_vld1q_s16_x3
-; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %2 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8* %1, i32 2)
- %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 0
- %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 1
- %5 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 2
- %6 = insertvalue %struct.int16x8x3_t undef, <8 x i16> %3, 0, 0
- %7 = insertvalue %struct.int16x8x3_t %6, <8 x i16> %4, 0, 1
- %8 = insertvalue %struct.int16x8x3_t %7, <8 x i16> %5, 0, 2
- ret %struct.int16x8x3_t %8
-}
-
-define %struct.int32x4x3_t @test_vld1q_s32_x3(i32* %a) {
-; CHECK-LABEL: test_vld1q_s32_x3
-; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %2 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x3.v4i32(i8* %1, i32 4)
- %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %2, 0
- %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %2, 1
- %5 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %2, 2
- %6 = insertvalue %struct.int32x4x3_t undef, <4 x i32> %3, 0, 0
- %7 = insertvalue %struct.int32x4x3_t %6, <4 x i32> %4, 0, 1
- %8 = insertvalue %struct.int32x4x3_t %7, <4 x i32> %5, 0, 2
- ret %struct.int32x4x3_t %8
-}
-
-define %struct.int64x2x3_t @test_vld1q_s64_x3(i64* %a) {
-; CHECK-LABEL: test_vld1q_s64_x3
-; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %2 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8* %1, i32 8)
- %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 0
- %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 1
- %5 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 2
- %6 = insertvalue %struct.int64x2x3_t undef, <2 x i64> %3, 0, 0
- %7 = insertvalue %struct.int64x2x3_t %6, <2 x i64> %4, 0, 1
- %8 = insertvalue %struct.int64x2x3_t %7, <2 x i64> %5, 0, 2
- ret %struct.int64x2x3_t %8
-}
-
-define %struct.float32x4x3_t @test_vld1q_f32_x3(float* %a) {
-; CHECK-LABEL: test_vld1q_f32_x3
-; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %2 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x3.v4f32(i8* %1, i32 4)
- %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %2, 0
- %4 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %2, 1
- %5 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %2, 2
- %6 = insertvalue %struct.float32x4x3_t undef, <4 x float> %3, 0, 0
- %7 = insertvalue %struct.float32x4x3_t %6, <4 x float> %4, 0, 1
- %8 = insertvalue %struct.float32x4x3_t %7, <4 x float> %5, 0, 2
- ret %struct.float32x4x3_t %8
-}
-
-
-define %struct.float64x2x3_t @test_vld1q_f64_x3(double* %a) {
-; CHECK-LABEL: test_vld1q_f64_x3
-; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %2 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x3.v2f64(i8* %1, i32 8)
- %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %2, 0
- %4 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %2, 1
- %5 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %2, 2
- %6 = insertvalue %struct.float64x2x3_t undef, <2 x double> %3, 0, 0
- %7 = insertvalue %struct.float64x2x3_t %6, <2 x double> %4, 0, 1
- %8 = insertvalue %struct.float64x2x3_t %7, <2 x double> %5, 0, 2
- ret %struct.float64x2x3_t %8
-}
-
-define %struct.int8x8x3_t @test_vld1_s8_x3(i8* %a) {
-; CHECK-LABEL: test_vld1_s8_x3
-; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x3.v8i8(i8* %a, i32 1)
- %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
- %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 1
- %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 2
- %5 = insertvalue %struct.int8x8x3_t undef, <8 x i8> %2, 0, 0
- %6 = insertvalue %struct.int8x8x3_t %5, <8 x i8> %3, 0, 1
- %7 = insertvalue %struct.int8x8x3_t %6, <8 x i8> %4, 0, 2
- ret %struct.int8x8x3_t %7
-}
-
-define %struct.int16x4x3_t @test_vld1_s16_x3(i16* %a) {
-; CHECK-LABEL: test_vld1_s16_x3
-; CHECK: ld1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x3.v4i16(i8* %1, i32 2)
- %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 0
- %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 1
- %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 2
- %6 = insertvalue %struct.int16x4x3_t undef, <4 x i16> %3, 0, 0
- %7 = insertvalue %struct.int16x4x3_t %6, <4 x i16> %4, 0, 1
- %8 = insertvalue %struct.int16x4x3_t %7, <4 x i16> %5, 0, 2
- ret %struct.int16x4x3_t %8
-}
-
-define %struct.int32x2x3_t @test_vld1_s32_x3(i32* %a) {
-; CHECK-LABEL: test_vld1_s32_x3
-; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x3.v2i32(i8* %1, i32 4)
- %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %2, 0
- %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %2, 1
- %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %2, 2
- %6 = insertvalue %struct.int32x2x3_t undef, <2 x i32> %3, 0, 0
- %7 = insertvalue %struct.int32x2x3_t %6, <2 x i32> %4, 0, 1
- %8 = insertvalue %struct.int32x2x3_t %7, <2 x i32> %5, 0, 2
- ret %struct.int32x2x3_t %8
-}
-
-define %struct.int64x1x3_t @test_vld1_s64_x3(i64* %a) {
-; CHECK-LABEL: test_vld1_s64_x3
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %2 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x3.v1i64(i8* %1, i32 8)
- %3 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %2, 0
- %4 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %2, 1
- %5 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %2, 2
- %6 = insertvalue %struct.int64x1x3_t undef, <1 x i64> %3, 0, 0
- %7 = insertvalue %struct.int64x1x3_t %6, <1 x i64> %4, 0, 1
- %8 = insertvalue %struct.int64x1x3_t %7, <1 x i64> %5, 0, 2
- ret %struct.int64x1x3_t %8
-}
-
-define %struct.float32x2x3_t @test_vld1_f32_x3(float* %a) {
-; CHECK-LABEL: test_vld1_f32_x3
-; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %2 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x3.v2f32(i8* %1, i32 4)
- %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %2, 0
- %4 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %2, 1
- %5 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %2, 2
- %6 = insertvalue %struct.float32x2x3_t undef, <2 x float> %3, 0, 0
- %7 = insertvalue %struct.float32x2x3_t %6, <2 x float> %4, 0, 1
- %8 = insertvalue %struct.float32x2x3_t %7, <2 x float> %5, 0, 2
- ret %struct.float32x2x3_t %8
-}
-
-
-define %struct.float64x1x3_t @test_vld1_f64_x3(double* %a) {
-; CHECK-LABEL: test_vld1_f64_x3
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %2 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x3.v1f64(i8* %1, i32 8)
- %3 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %2, 0
- %4 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %2, 1
- %5 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %2, 2
- %6 = insertvalue %struct.float64x1x3_t undef, <1 x double> %3, 0, 0
- %7 = insertvalue %struct.float64x1x3_t %6, <1 x double> %4, 0, 1
- %8 = insertvalue %struct.float64x1x3_t %7, <1 x double> %5, 0, 2
- ret %struct.float64x1x3_t %8
-}
-
-define %struct.int8x16x4_t @test_vld1q_s8_x4(i8* %a) {
-; CHECK-LABEL: test_vld1q_s8_x4
-; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x4.v16i8(i8* %a, i32 1)
- %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 0
- %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 1
- %4 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 2
- %5 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 3
- %6 = insertvalue %struct.int8x16x4_t undef, <16 x i8> %2, 0, 0
- %7 = insertvalue %struct.int8x16x4_t %6, <16 x i8> %3, 0, 1
- %8 = insertvalue %struct.int8x16x4_t %7, <16 x i8> %4, 0, 2
- %9 = insertvalue %struct.int8x16x4_t %8, <16 x i8> %5, 0, 3
- ret %struct.int8x16x4_t %9
-}
-
-define %struct.int16x8x4_t @test_vld1q_s16_x4(i16* %a) {
-; CHECK-LABEL: test_vld1q_s16_x4
-; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %2 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x4.v8i16(i8* %1, i32 2)
- %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 0
- %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 1
- %5 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 2
- %6 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 3
- %7 = insertvalue %struct.int16x8x4_t undef, <8 x i16> %3, 0, 0
- %8 = insertvalue %struct.int16x8x4_t %7, <8 x i16> %4, 0, 1
- %9 = insertvalue %struct.int16x8x4_t %8, <8 x i16> %5, 0, 2
- %10 = insertvalue %struct.int16x8x4_t %9, <8 x i16> %6, 0, 3
- ret %struct.int16x8x4_t %10
-}
-
-define %struct.int32x4x4_t @test_vld1q_s32_x4(i32* %a) {
-; CHECK-LABEL: test_vld1q_s32_x4
-; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %2 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x4.v4i32(i8* %1, i32 4)
- %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 0
- %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 1
- %5 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 2
- %6 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 3
- %7 = insertvalue %struct.int32x4x4_t undef, <4 x i32> %3, 0, 0
- %8 = insertvalue %struct.int32x4x4_t %7, <4 x i32> %4, 0, 1
- %9 = insertvalue %struct.int32x4x4_t %8, <4 x i32> %5, 0, 2
- %10 = insertvalue %struct.int32x4x4_t %9, <4 x i32> %6, 0, 3
- ret %struct.int32x4x4_t %10
-}
-
-define %struct.int64x2x4_t @test_vld1q_s64_x4(i64* %a) {
-; CHECK-LABEL: test_vld1q_s64_x4
-; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %2 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x4.v2i64(i8* %1, i32 8)
- %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 0
- %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 1
- %5 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 2
- %6 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 3
- %7 = insertvalue %struct.int64x2x4_t undef, <2 x i64> %3, 0, 0
- %8 = insertvalue %struct.int64x2x4_t %7, <2 x i64> %4, 0, 1
- %9 = insertvalue %struct.int64x2x4_t %8, <2 x i64> %5, 0, 2
- %10 = insertvalue %struct.int64x2x4_t %9, <2 x i64> %6, 0, 3
- ret %struct.int64x2x4_t %10
-}
-
-define %struct.float32x4x4_t @test_vld1q_f32_x4(float* %a) {
-; CHECK-LABEL: test_vld1q_f32_x4
-; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8* %1, i32 4)
- %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0
- %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 1
- %5 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 2
- %6 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 3
- %7 = insertvalue %struct.float32x4x4_t undef, <4 x float> %3, 0, 0
- %8 = insertvalue %struct.float32x4x4_t %7, <4 x float> %4, 0, 1
- %9 = insertvalue %struct.float32x4x4_t %8, <4 x float> %5, 0, 2
- %10 = insertvalue %struct.float32x4x4_t %9, <4 x float> %6, 0, 3
- ret %struct.float32x4x4_t %10
-}
-
-define %struct.float64x2x4_t @test_vld1q_f64_x4(double* %a) {
-; CHECK-LABEL: test_vld1q_f64_x4
-; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %2 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x4.v2f64(i8* %1, i32 8)
- %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 0
- %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 1
- %5 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 2
- %6 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 3
- %7 = insertvalue %struct.float64x2x4_t undef, <2 x double> %3, 0, 0
- %8 = insertvalue %struct.float64x2x4_t %7, <2 x double> %4, 0, 1
- %9 = insertvalue %struct.float64x2x4_t %8, <2 x double> %5, 0, 2
- %10 = insertvalue %struct.float64x2x4_t %9, <2 x double> %6, 0, 3
- ret %struct.float64x2x4_t %10
-}
-
-define %struct.int8x8x4_t @test_vld1_s8_x4(i8* %a) {
-; CHECK-LABEL: test_vld1_s8_x4
-; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8* %a, i32 1)
- %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
- %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 1
- %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 2
- %5 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 3
- %6 = insertvalue %struct.int8x8x4_t undef, <8 x i8> %2, 0, 0
- %7 = insertvalue %struct.int8x8x4_t %6, <8 x i8> %3, 0, 1
- %8 = insertvalue %struct.int8x8x4_t %7, <8 x i8> %4, 0, 2
- %9 = insertvalue %struct.int8x8x4_t %8, <8 x i8> %5, 0, 3
- ret %struct.int8x8x4_t %9
-}
-
-define %struct.int16x4x4_t @test_vld1_s16_x4(i16* %a) {
-; CHECK-LABEL: test_vld1_s16_x4
-; CHECK: ld1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x4.v4i16(i8* %1, i32 2)
- %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 0
- %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 1
- %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 2
- %6 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 3
- %7 = insertvalue %struct.int16x4x4_t undef, <4 x i16> %3, 0, 0
- %8 = insertvalue %struct.int16x4x4_t %7, <4 x i16> %4, 0, 1
- %9 = insertvalue %struct.int16x4x4_t %8, <4 x i16> %5, 0, 2
- %10 = insertvalue %struct.int16x4x4_t %9, <4 x i16> %6, 0, 3
- ret %struct.int16x4x4_t %10
-}
-
-define %struct.int32x2x4_t @test_vld1_s32_x4(i32* %a) {
-; CHECK-LABEL: test_vld1_s32_x4
-; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x4.v2i32(i8* %1, i32 4)
- %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 0
- %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 1
- %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 2
- %6 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 3
- %7 = insertvalue %struct.int32x2x4_t undef, <2 x i32> %3, 0, 0
- %8 = insertvalue %struct.int32x2x4_t %7, <2 x i32> %4, 0, 1
- %9 = insertvalue %struct.int32x2x4_t %8, <2 x i32> %5, 0, 2
- %10 = insertvalue %struct.int32x2x4_t %9, <2 x i32> %6, 0, 3
- ret %struct.int32x2x4_t %10
-}
-
-define %struct.int64x1x4_t @test_vld1_s64_x4(i64* %a) {
-; CHECK-LABEL: test_vld1_s64_x4
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %2 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x4.v1i64(i8* %1, i32 8)
- %3 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 0
- %4 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 1
- %5 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 2
- %6 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 3
- %7 = insertvalue %struct.int64x1x4_t undef, <1 x i64> %3, 0, 0
- %8 = insertvalue %struct.int64x1x4_t %7, <1 x i64> %4, 0, 1
- %9 = insertvalue %struct.int64x1x4_t %8, <1 x i64> %5, 0, 2
- %10 = insertvalue %struct.int64x1x4_t %9, <1 x i64> %6, 0, 3
- ret %struct.int64x1x4_t %10
-}
-
-define %struct.float32x2x4_t @test_vld1_f32_x4(float* %a) {
-; CHECK-LABEL: test_vld1_f32_x4
-; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %2 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x4.v2f32(i8* %1, i32 4)
- %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 0
- %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 1
- %5 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 2
- %6 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 3
- %7 = insertvalue %struct.float32x2x4_t undef, <2 x float> %3, 0, 0
- %8 = insertvalue %struct.float32x2x4_t %7, <2 x float> %4, 0, 1
- %9 = insertvalue %struct.float32x2x4_t %8, <2 x float> %5, 0, 2
- %10 = insertvalue %struct.float32x2x4_t %9, <2 x float> %6, 0, 3
- ret %struct.float32x2x4_t %10
-}
-
-
-define %struct.float64x1x4_t @test_vld1_f64_x4(double* %a) {
-; CHECK-LABEL: test_vld1_f64_x4
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %2 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x4.v1f64(i8* %1, i32 8)
- %3 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 0
- %4 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 1
- %5 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 2
- %6 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 3
- %7 = insertvalue %struct.float64x1x4_t undef, <1 x double> %3, 0, 0
- %8 = insertvalue %struct.float64x1x4_t %7, <1 x double> %4, 0, 1
- %9 = insertvalue %struct.float64x1x4_t %8, <1 x double> %5, 0, 2
- %10 = insertvalue %struct.float64x1x4_t %9, <1 x double> %6, 0, 3
- ret %struct.float64x1x4_t %10
-}
-
-define void @test_vst1q_s8_x2(i8* %a, [2 x <16 x i8>] %b) {
-; CHECK-LABEL: test_vst1q_s8_x2
-; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <16 x i8>] %b, 0
- %2 = extractvalue [2 x <16 x i8>] %b, 1
- tail call void @llvm.aarch64.neon.vst1x2.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, i32 1)
- ret void
-}
-
-define void @test_vst1q_s16_x2(i16* %a, [2 x <8 x i16>] %b) {
-; CHECK-LABEL: test_vst1q_s16_x2
-; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <8 x i16>] %b, 0
- %2 = extractvalue [2 x <8 x i16>] %b, 1
- %3 = bitcast i16* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v8i16(i8* %3, <8 x i16> %1, <8 x i16> %2, i32 2)
- ret void
-}
-
-define void @test_vst1q_s32_x2(i32* %a, [2 x <4 x i32>] %b) {
-; CHECK-LABEL: test_vst1q_s32_x2
-; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <4 x i32>] %b, 0
- %2 = extractvalue [2 x <4 x i32>] %b, 1
- %3 = bitcast i32* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v4i32(i8* %3, <4 x i32> %1, <4 x i32> %2, i32 4)
- ret void
-}
-
-define void @test_vst1q_s64_x2(i64* %a, [2 x <2 x i64>] %b) {
-; CHECK-LABEL: test_vst1q_s64_x2
-; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <2 x i64>] %b, 0
- %2 = extractvalue [2 x <2 x i64>] %b, 1
- %3 = bitcast i64* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v2i64(i8* %3, <2 x i64> %1, <2 x i64> %2, i32 8)
- ret void
-}
-
-define void @test_vst1q_f32_x2(float* %a, [2 x <4 x float>] %b) {
-; CHECK-LABEL: test_vst1q_f32_x2
-; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <4 x float>] %b, 0
- %2 = extractvalue [2 x <4 x float>] %b, 1
- %3 = bitcast float* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v4f32(i8* %3, <4 x float> %1, <4 x float> %2, i32 4)
- ret void
-}
-
-
-define void @test_vst1q_f64_x2(double* %a, [2 x <2 x double>] %b) {
-; CHECK-LABEL: test_vst1q_f64_x2
-; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <2 x double>] %b, 0
- %2 = extractvalue [2 x <2 x double>] %b, 1
- %3 = bitcast double* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v2f64(i8* %3, <2 x double> %1, <2 x double> %2, i32 8)
- ret void
-}
-
-define void @test_vst1_s8_x2(i8* %a, [2 x <8 x i8>] %b) {
-; CHECK-LABEL: test_vst1_s8_x2
-; CHECK: st1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <8 x i8>] %b, 0
- %2 = extractvalue [2 x <8 x i8>] %b, 1
- tail call void @llvm.aarch64.neon.vst1x2.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 1)
- ret void
-}
-
-define void @test_vst1_s16_x2(i16* %a, [2 x <4 x i16>] %b) {
-; CHECK-LABEL: test_vst1_s16_x2
-; CHECK: st1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <4 x i16>] %b, 0
- %2 = extractvalue [2 x <4 x i16>] %b, 1
- %3 = bitcast i16* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v4i16(i8* %3, <4 x i16> %1, <4 x i16> %2, i32 2)
- ret void
-}
-
-define void @test_vst1_s32_x2(i32* %a, [2 x <2 x i32>] %b) {
-; CHECK-LABEL: test_vst1_s32_x2
-; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <2 x i32>] %b, 0
- %2 = extractvalue [2 x <2 x i32>] %b, 1
- %3 = bitcast i32* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v2i32(i8* %3, <2 x i32> %1, <2 x i32> %2, i32 4)
- ret void
-}
-
-define void @test_vst1_s64_x2(i64* %a, [2 x <1 x i64>] %b) {
-; CHECK-LABEL: test_vst1_s64_x2
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <1 x i64>] %b, 0
- %2 = extractvalue [2 x <1 x i64>] %b, 1
- %3 = bitcast i64* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v1i64(i8* %3, <1 x i64> %1, <1 x i64> %2, i32 8)
- ret void
-}
-
-define void @test_vst1_f32_x2(float* %a, [2 x <2 x float>] %b) {
-; CHECK-LABEL: test_vst1_f32_x2
-; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <2 x float>] %b, 0
- %2 = extractvalue [2 x <2 x float>] %b, 1
- %3 = bitcast float* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v2f32(i8* %3, <2 x float> %1, <2 x float> %2, i32 4)
- ret void
-}
-
-define void @test_vst1_f64_x2(double* %a, [2 x <1 x double>] %b) {
-; CHECK-LABEL: test_vst1_f64_x2
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [2 x <1 x double>] %b, 0
- %2 = extractvalue [2 x <1 x double>] %b, 1
- %3 = bitcast double* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v1f64(i8* %3, <1 x double> %1, <1 x double> %2, i32 8)
- ret void
-}
-
-define void @test_vst1q_s8_x3(i8* %a, [3 x <16 x i8>] %b) {
-; CHECK-LABEL: test_vst1q_s8_x3
-; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <16 x i8>] %b, 0
- %2 = extractvalue [3 x <16 x i8>] %b, 1
- %3 = extractvalue [3 x <16 x i8>] %b, 2
- tail call void @llvm.aarch64.neon.vst1x3.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3, i32 1)
- ret void
-}
-
-define void @test_vst1q_s16_x3(i16* %a, [3 x <8 x i16>] %b) {
-; CHECK-LABEL: test_vst1q_s16_x3
-; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <8 x i16>] %b, 0
- %2 = extractvalue [3 x <8 x i16>] %b, 1
- %3 = extractvalue [3 x <8 x i16>] %b, 2
- %4 = bitcast i16* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v8i16(i8* %4, <8 x i16> %1, <8 x i16> %2, <8 x i16> %3, i32 2)
- ret void
-}
-
-define void @test_vst1q_s32_x3(i32* %a, [3 x <4 x i32>] %b) {
-; CHECK-LABEL: test_vst1q_s32_x3
-; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <4 x i32>] %b, 0
- %2 = extractvalue [3 x <4 x i32>] %b, 1
- %3 = extractvalue [3 x <4 x i32>] %b, 2
- %4 = bitcast i32* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v4i32(i8* %4, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, i32 4)
- ret void
-}
-
-define void @test_vst1q_s64_x3(i64* %a, [3 x <2 x i64>] %b) {
-; CHECK-LABEL: test_vst1q_s64_x3
-; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <2 x i64>] %b, 0
- %2 = extractvalue [3 x <2 x i64>] %b, 1
- %3 = extractvalue [3 x <2 x i64>] %b, 2
- %4 = bitcast i64* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v2i64(i8* %4, <2 x i64> %1, <2 x i64> %2, <2 x i64> %3, i32 8)
- ret void
-}
-
-define void @test_vst1q_f32_x3(float* %a, [3 x <4 x float>] %b) {
-; CHECK-LABEL: test_vst1q_f32_x3
-; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <4 x float>] %b, 0
- %2 = extractvalue [3 x <4 x float>] %b, 1
- %3 = extractvalue [3 x <4 x float>] %b, 2
- %4 = bitcast float* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v4f32(i8* %4, <4 x float> %1, <4 x float> %2, <4 x float> %3, i32 4)
- ret void
-}
-
-define void @test_vst1q_f64_x3(double* %a, [3 x <2 x double>] %b) {
-; CHECK-LABEL: test_vst1q_f64_x3
-; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <2 x double>] %b, 0
- %2 = extractvalue [3 x <2 x double>] %b, 1
- %3 = extractvalue [3 x <2 x double>] %b, 2
- %4 = bitcast double* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v2f64(i8* %4, <2 x double> %1, <2 x double> %2, <2 x double> %3, i32 8)
- ret void
-}
-
-define void @test_vst1_s8_x3(i8* %a, [3 x <8 x i8>] %b) {
-; CHECK-LABEL: test_vst1_s8_x3
-; CHECK: st1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <8 x i8>] %b, 0
- %2 = extractvalue [3 x <8 x i8>] %b, 1
- %3 = extractvalue [3 x <8 x i8>] %b, 2
- tail call void @llvm.aarch64.neon.vst1x3.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, i32 1)
- ret void
-}
-
-define void @test_vst1_s16_x3(i16* %a, [3 x <4 x i16>] %b) {
-; CHECK-LABEL: test_vst1_s16_x3
-; CHECK: st1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <4 x i16>] %b, 0
- %2 = extractvalue [3 x <4 x i16>] %b, 1
- %3 = extractvalue [3 x <4 x i16>] %b, 2
- %4 = bitcast i16* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 2)
- ret void
-}
-
-define void @test_vst1_s32_x3(i32* %a, [3 x <2 x i32>] %b) {
-; CHECK-LABEL: test_vst1_s32_x3
-; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <2 x i32>] %b, 0
- %2 = extractvalue [3 x <2 x i32>] %b, 1
- %3 = extractvalue [3 x <2 x i32>] %b, 2
- %4 = bitcast i32* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v2i32(i8* %4, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, i32 4)
- ret void
-}
-
-define void @test_vst1_s64_x3(i64* %a, [3 x <1 x i64>] %b) {
-; CHECK-LABEL: test_vst1_s64_x3
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <1 x i64>] %b, 0
- %2 = extractvalue [3 x <1 x i64>] %b, 1
- %3 = extractvalue [3 x <1 x i64>] %b, 2
- %4 = bitcast i64* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v1i64(i8* %4, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, i32 8)
- ret void
-}
-
-define void @test_vst1_f32_x3(float* %a, [3 x <2 x float>] %b) {
-; CHECK-LABEL: test_vst1_f32_x3
-; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <2 x float>] %b, 0
- %2 = extractvalue [3 x <2 x float>] %b, 1
- %3 = extractvalue [3 x <2 x float>] %b, 2
- %4 = bitcast float* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v2f32(i8* %4, <2 x float> %1, <2 x float> %2, <2 x float> %3, i32 4)
- ret void
-}
-
-define void @test_vst1_f64_x3(double* %a, [3 x <1 x double>] %b) {
-; CHECK-LABEL: test_vst1_f64_x3
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [3 x <1 x double>] %b, 0
- %2 = extractvalue [3 x <1 x double>] %b, 1
- %3 = extractvalue [3 x <1 x double>] %b, 2
- %4 = bitcast double* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v1f64(i8* %4, <1 x double> %1, <1 x double> %2, <1 x double> %3, i32 8)
- ret void
-}
-
-define void @test_vst1q_s8_x4(i8* %a, [4 x <16 x i8>] %b) {
-; CHECK-LABEL: test_vst1q_s8_x4
-; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <16 x i8>] %b, 0
- %2 = extractvalue [4 x <16 x i8>] %b, 1
- %3 = extractvalue [4 x <16 x i8>] %b, 2
- %4 = extractvalue [4 x <16 x i8>] %b, 3
- tail call void @llvm.aarch64.neon.vst1x4.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3, <16 x i8> %4, i32 1)
- ret void
-}
-
-define void @test_vst1q_s16_x4(i16* %a, [4 x <8 x i16>] %b) {
-; CHECK-LABEL: test_vst1q_s16_x4
-; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <8 x i16>] %b, 0
- %2 = extractvalue [4 x <8 x i16>] %b, 1
- %3 = extractvalue [4 x <8 x i16>] %b, 2
- %4 = extractvalue [4 x <8 x i16>] %b, 3
- %5 = bitcast i16* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v8i16(i8* %5, <8 x i16> %1, <8 x i16> %2, <8 x i16> %3, <8 x i16> %4, i32 2)
- ret void
-}
-
-define void @test_vst1q_s32_x4(i32* %a, [4 x <4 x i32>] %b) {
-; CHECK-LABEL: test_vst1q_s32_x4
-; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <4 x i32>] %b, 0
- %2 = extractvalue [4 x <4 x i32>] %b, 1
- %3 = extractvalue [4 x <4 x i32>] %b, 2
- %4 = extractvalue [4 x <4 x i32>] %b, 3
- %5 = bitcast i32* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v4i32(i8* %5, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, i32 4)
- ret void
-}
-
-define void @test_vst1q_s64_x4(i64* %a, [4 x <2 x i64>] %b) {
-; CHECK-LABEL: test_vst1q_s64_x4
-; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <2 x i64>] %b, 0
- %2 = extractvalue [4 x <2 x i64>] %b, 1
- %3 = extractvalue [4 x <2 x i64>] %b, 2
- %4 = extractvalue [4 x <2 x i64>] %b, 3
- %5 = bitcast i64* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v2i64(i8* %5, <2 x i64> %1, <2 x i64> %2, <2 x i64> %3, <2 x i64> %4, i32 8)
- ret void
-}
-
-define void @test_vst1q_f32_x4(float* %a, [4 x <4 x float>] %b) {
-; CHECK-LABEL: test_vst1q_f32_x4
-; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <4 x float>] %b, 0
- %2 = extractvalue [4 x <4 x float>] %b, 1
- %3 = extractvalue [4 x <4 x float>] %b, 2
- %4 = extractvalue [4 x <4 x float>] %b, 3
- %5 = bitcast float* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v4f32(i8* %5, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, i32 4)
- ret void
-}
-
-define void @test_vst1q_f64_x4(double* %a, [4 x <2 x double>] %b) {
-; CHECK-LABEL: test_vst1q_f64_x4
-; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <2 x double>] %b, 0
- %2 = extractvalue [4 x <2 x double>] %b, 1
- %3 = extractvalue [4 x <2 x double>] %b, 2
- %4 = extractvalue [4 x <2 x double>] %b, 3
- %5 = bitcast double* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 8)
- ret void
-}
-
-define void @test_vst1_s8_x4(i8* %a, [4 x <8 x i8>] %b) {
-; CHECK-LABEL: test_vst1_s8_x4
-; CHECK: st1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <8 x i8>] %b, 0
- %2 = extractvalue [4 x <8 x i8>] %b, 1
- %3 = extractvalue [4 x <8 x i8>] %b, 2
- %4 = extractvalue [4 x <8 x i8>] %b, 3
- tail call void @llvm.aarch64.neon.vst1x4.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, <8 x i8> %4, i32 1)
- ret void
-}
-
-define void @test_vst1_s16_x4(i16* %a, [4 x <4 x i16>] %b) {
-; CHECK-LABEL: test_vst1_s16_x4
-; CHECK: st1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <4 x i16>] %b, 0
- %2 = extractvalue [4 x <4 x i16>] %b, 1
- %3 = extractvalue [4 x <4 x i16>] %b, 2
- %4 = extractvalue [4 x <4 x i16>] %b, 3
- %5 = bitcast i16* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v4i16(i8* %5, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, <4 x i16> %4, i32 2)
- ret void
-}
-
-define void @test_vst1_s32_x4(i32* %a, [4 x <2 x i32>] %b) {
-; CHECK-LABEL: test_vst1_s32_x4
-; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <2 x i32>] %b, 0
- %2 = extractvalue [4 x <2 x i32>] %b, 1
- %3 = extractvalue [4 x <2 x i32>] %b, 2
- %4 = extractvalue [4 x <2 x i32>] %b, 3
- %5 = bitcast i32* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v2i32(i8* %5, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, <2 x i32> %4, i32 4)
- ret void
-}
-
-define void @test_vst1_s64_x4(i64* %a, [4 x <1 x i64>] %b) {
-; CHECK-LABEL: test_vst1_s64_x4
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <1 x i64>] %b, 0
- %2 = extractvalue [4 x <1 x i64>] %b, 1
- %3 = extractvalue [4 x <1 x i64>] %b, 2
- %4 = extractvalue [4 x <1 x i64>] %b, 3
- %5 = bitcast i64* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v1i64(i8* %5, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, <1 x i64> %4, i32 8)
- ret void
-}
-
-define void @test_vst1_f32_x4(float* %a, [4 x <2 x float>] %b) {
-; CHECK-LABEL: test_vst1_f32_x4
-; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <2 x float>] %b, 0
- %2 = extractvalue [4 x <2 x float>] %b, 1
- %3 = extractvalue [4 x <2 x float>] %b, 2
- %4 = extractvalue [4 x <2 x float>] %b, 3
- %5 = bitcast float* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v2f32(i8* %5, <2 x float> %1, <2 x float> %2, <2 x float> %3, <2 x float> %4, i32 4)
- ret void
-}
-
-define void @test_vst1_f64_x4(double* %a, [4 x <1 x double>] %b) {
-; CHECK-LABEL: test_vst1_f64_x4
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = extractvalue [4 x <1 x double>] %b, 0
- %2 = extractvalue [4 x <1 x double>] %b, 1
- %3 = extractvalue [4 x <1 x double>] %b, 2
- %4 = extractvalue [4 x <1 x double>] %b, 3
- %5 = bitcast double* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v1f64(i8* %5, <1 x double> %1, <1 x double> %2, <1 x double> %3, <1 x double> %4, i32 8)
- ret void
-}
-
-declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8*, i32)
-declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8*, i32)
-declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x2.v4i32(i8*, i32)
-declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x2.v2i64(i8*, i32)
-declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x2.v4f32(i8*, i32)
-declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x2.v2f64(i8*, i32)
-declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x2.v8i8(i8*, i32)
-declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x2.v4i16(i8*, i32)
-declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x2.v2i32(i8*, i32)
-declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x2.v1i64(i8*, i32)
-declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x2.v2f32(i8*, i32)
-declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x2.v1f64(i8*, i32)
-declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x3.v16i8(i8*, i32)
-declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8*, i32)
-declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x3.v4i32(i8*, i32)
-declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8*, i32)
-declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x3.v4f32(i8*, i32)
-declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x3.v2f64(i8*, i32)
-declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x3.v8i8(i8*, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x3.v4i16(i8*, i32)
-declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x3.v2i32(i8*, i32)
-declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x3.v1i64(i8*, i32)
-declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x3.v2f32(i8*, i32)
-declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x3.v1f64(i8*, i32)
-declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x4.v16i8(i8*, i32)
-declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x4.v8i16(i8*, i32)
-declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x4.v4i32(i8*, i32)
-declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x4.v2i64(i8*, i32)
-declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8*, i32)
-declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x4.v2f64(i8*, i32)
-declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8*, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x4.v4i16(i8*, i32)
-declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x4.v2i32(i8*, i32)
-declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x4.v1i64(i8*, i32)
-declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x4.v2f32(i8*, i32)
-declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x4.v1f64(i8*, i32)
-declare void @llvm.aarch64.neon.vst1x2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v8i16(i8*, <8 x i16>, <8 x i16>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v4i32(i8*, <4 x i32>, <4 x i32>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v2i64(i8*, <2 x i64>, <2 x i64>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v4f32(i8*, <4 x float>, <4 x float>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v2f64(i8*, <2 x double>, <2 x double>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v4i16(i8*, <4 x i16>, <4 x i16>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v2i32(i8*, <2 x i32>, <2 x i32>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v1i64(i8*, <1 x i64>, <1 x i64>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v2f32(i8*, <2 x float>, <2 x float>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v1f64(i8*, <1 x double>, <1 x double>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32)
diff --git a/test/CodeGen/AArch64/neon-simd-ldst-one.ll b/test/CodeGen/AArch64/neon-simd-ldst-one.ll
deleted file mode 100644
index 927c933..0000000
--- a/test/CodeGen/AArch64/neon-simd-ldst-one.ll
+++ /dev/null
@@ -1,2299 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-%struct.uint8x16x2_t = type { [2 x <16 x i8>] }
-%struct.poly8x16x2_t = type { [2 x <16 x i8>] }
-%struct.uint8x16x3_t = type { [3 x <16 x i8>] }
-%struct.int8x16x2_t = type { [2 x <16 x i8>] }
-%struct.int16x8x2_t = type { [2 x <8 x i16>] }
-%struct.int32x4x2_t = type { [2 x <4 x i32>] }
-%struct.int64x2x2_t = type { [2 x <2 x i64>] }
-%struct.float32x4x2_t = type { [2 x <4 x float>] }
-%struct.float64x2x2_t = type { [2 x <2 x double>] }
-%struct.int8x8x2_t = type { [2 x <8 x i8>] }
-%struct.int16x4x2_t = type { [2 x <4 x i16>] }
-%struct.int32x2x2_t = type { [2 x <2 x i32>] }
-%struct.int64x1x2_t = type { [2 x <1 x i64>] }
-%struct.float32x2x2_t = type { [2 x <2 x float>] }
-%struct.float64x1x2_t = type { [2 x <1 x double>] }
-%struct.int8x16x3_t = type { [3 x <16 x i8>] }
-%struct.int16x8x3_t = type { [3 x <8 x i16>] }
-%struct.int32x4x3_t = type { [3 x <4 x i32>] }
-%struct.int64x2x3_t = type { [3 x <2 x i64>] }
-%struct.float32x4x3_t = type { [3 x <4 x float>] }
-%struct.float64x2x3_t = type { [3 x <2 x double>] }
-%struct.int8x8x3_t = type { [3 x <8 x i8>] }
-%struct.int16x4x3_t = type { [3 x <4 x i16>] }
-%struct.int32x2x3_t = type { [3 x <2 x i32>] }
-%struct.int64x1x3_t = type { [3 x <1 x i64>] }
-%struct.float32x2x3_t = type { [3 x <2 x float>] }
-%struct.float64x1x3_t = type { [3 x <1 x double>] }
-%struct.int8x16x4_t = type { [4 x <16 x i8>] }
-%struct.int16x8x4_t = type { [4 x <8 x i16>] }
-%struct.int32x4x4_t = type { [4 x <4 x i32>] }
-%struct.int64x2x4_t = type { [4 x <2 x i64>] }
-%struct.float32x4x4_t = type { [4 x <4 x float>] }
-%struct.float64x2x4_t = type { [4 x <2 x double>] }
-%struct.int8x8x4_t = type { [4 x <8 x i8>] }
-%struct.int16x4x4_t = type { [4 x <4 x i16>] }
-%struct.int32x2x4_t = type { [4 x <2 x i32>] }
-%struct.int64x1x4_t = type { [4 x <1 x i64>] }
-%struct.float32x2x4_t = type { [4 x <2 x float>] }
-%struct.float64x1x4_t = type { [4 x <1 x double>] }
-
-define <16 x i8> @test_ld_from_poll_v16i8(<16 x i8> %a) {
-; CHECK-LABEL: test_ld_from_poll_v16i8
-; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
-; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
-entry:
- %b = add <16 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 2, i8 13, i8 14, i8 15, i8 16>
- ret <16 x i8> %b
-}
-
-define <8 x i16> @test_ld_from_poll_v8i16(<8 x i16> %a) {
-; CHECK-LABEL: test_ld_from_poll_v8i16
-; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
-; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
-entry:
- %b = add <8 x i16> %a, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
- ret <8 x i16> %b
-}
-
-define <4 x i32> @test_ld_from_poll_v4i32(<4 x i32> %a) {
-; CHECK-LABEL: test_ld_from_poll_v4i32
-; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
-; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
-entry:
- %b = add <4 x i32> %a, <i32 1, i32 2, i32 3, i32 4>
- ret <4 x i32> %b
-}
-
-define <2 x i64> @test_ld_from_poll_v2i64(<2 x i64> %a) {
-; CHECK-LABEL: test_ld_from_poll_v2i64
-; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
-; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
-entry:
- %b = add <2 x i64> %a, <i64 1, i64 2>
- ret <2 x i64> %b
-}
-
-define <4 x float> @test_ld_from_poll_v4f32(<4 x float> %a) {
-; CHECK-LABEL: test_ld_from_poll_v4f32
-; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
-; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
-entry:
- %b = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
- ret <4 x float> %b
-}
-
-define <2 x double> @test_ld_from_poll_v2f64(<2 x double> %a) {
-; CHECK-LABEL: test_ld_from_poll_v2f64
-; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
-; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
-entry:
- %b = fadd <2 x double> %a, <double 1.0, double 2.0>
- ret <2 x double> %b
-}
-
-define <8 x i8> @test_ld_from_poll_v8i8(<8 x i8> %a) {
-; CHECK-LABEL: test_ld_from_poll_v8i8
-; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
-; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
-entry:
- %b = add <8 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>
- ret <8 x i8> %b
-}
-
-define <4 x i16> @test_ld_from_poll_v4i16(<4 x i16> %a) {
-; CHECK-LABEL: test_ld_from_poll_v4i16
-; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
-; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
-entry:
- %b = add <4 x i16> %a, <i16 1, i16 2, i16 3, i16 4>
- ret <4 x i16> %b
-}
-
-define <2 x i32> @test_ld_from_poll_v2i32(<2 x i32> %a) {
-; CHECK-LABEL: test_ld_from_poll_v2i32
-; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
-; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
-entry:
- %b = add <2 x i32> %a, <i32 1, i32 2>
- ret <2 x i32> %b
-}
-
-define <16 x i8> @test_vld1q_dup_s8(i8* %a) {
-; CHECK-LABEL: test_vld1q_dup_s8
-; CHECK: ld1r {{{v[0-9]+}}.16b}, [x0]
-entry:
- %0 = load i8* %a, align 1
- %1 = insertelement <16 x i8> undef, i8 %0, i32 0
- %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
- ret <16 x i8> %lane
-}
-
-define <8 x i16> @test_vld1q_dup_s16(i16* %a) {
-; CHECK-LABEL: test_vld1q_dup_s16
-; CHECK: ld1r {{{v[0-9]+}}.8h}, [x0]
-entry:
- %0 = load i16* %a, align 2
- %1 = insertelement <8 x i16> undef, i16 %0, i32 0
- %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
- ret <8 x i16> %lane
-}
-
-define <4 x i32> @test_vld1q_dup_s32(i32* %a) {
-; CHECK-LABEL: test_vld1q_dup_s32
-; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
-entry:
- %0 = load i32* %a, align 4
- %1 = insertelement <4 x i32> undef, i32 %0, i32 0
- %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
- ret <4 x i32> %lane
-}
-
-define <2 x i64> @test_vld1q_dup_s64(i64* %a) {
-; CHECK-LABEL: test_vld1q_dup_s64
-; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
-entry:
- %0 = load i64* %a, align 8
- %1 = insertelement <2 x i64> undef, i64 %0, i32 0
- %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
- ret <2 x i64> %lane
-}
-
-define <4 x float> @test_vld1q_dup_f32(float* %a) {
-; CHECK-LABEL: test_vld1q_dup_f32
-; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
-entry:
- %0 = load float* %a, align 4
- %1 = insertelement <4 x float> undef, float %0, i32 0
- %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
- ret <4 x float> %lane
-}
-
-define <2 x double> @test_vld1q_dup_f64(double* %a) {
-; CHECK-LABEL: test_vld1q_dup_f64
-; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
-entry:
- %0 = load double* %a, align 8
- %1 = insertelement <2 x double> undef, double %0, i32 0
- %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
- ret <2 x double> %lane
-}
-
-define <8 x i8> @test_vld1_dup_s8(i8* %a) {
-; CHECK-LABEL: test_vld1_dup_s8
-; CHECK: ld1r {{{v[0-9]+}}.8b}, [x0]
-entry:
- %0 = load i8* %a, align 1
- %1 = insertelement <8 x i8> undef, i8 %0, i32 0
- %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
- ret <8 x i8> %lane
-}
-
-define <4 x i16> @test_vld1_dup_s16(i16* %a) {
-; CHECK-LABEL: test_vld1_dup_s16
-; CHECK: ld1r {{{v[0-9]+}}.4h}, [x0]
-entry:
- %0 = load i16* %a, align 2
- %1 = insertelement <4 x i16> undef, i16 %0, i32 0
- %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
- ret <4 x i16> %lane
-}
-
-define <2 x i32> @test_vld1_dup_s32(i32* %a) {
-; CHECK-LABEL: test_vld1_dup_s32
-; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
-entry:
- %0 = load i32* %a, align 4
- %1 = insertelement <2 x i32> undef, i32 %0, i32 0
- %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
- ret <2 x i32> %lane
-}
-
-define <1 x i64> @test_vld1_dup_s64(i64* %a) {
-; CHECK-LABEL: test_vld1_dup_s64
-; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
-entry:
- %0 = load i64* %a, align 8
- %1 = insertelement <1 x i64> undef, i64 %0, i32 0
- ret <1 x i64> %1
-}
-
-define <2 x float> @test_vld1_dup_f32(float* %a) {
-; CHECK-LABEL: test_vld1_dup_f32
-; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
-entry:
- %0 = load float* %a, align 4
- %1 = insertelement <2 x float> undef, float %0, i32 0
- %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
- ret <2 x float> %lane
-}
-
-define <1 x double> @test_vld1_dup_f64(double* %a) {
-; CHECK-LABEL: test_vld1_dup_f64
-; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
-entry:
- %0 = load double* %a, align 8
- %1 = insertelement <1 x double> undef, double %0, i32 0
- ret <1 x double> %1
-}
-
-define <1 x i64> @testDUP.v1i64(i64* %a, i64* %b) #0 {
-; As there is a store operation depending on %1, the LD1R pattern cannot be selected,
-; so LDR and FMOV should be emitted instead.
-; CHECK-LABEL: testDUP.v1i64
-; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}]
-; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
-; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}]
- %1 = load i64* %a, align 8
- store i64 %1, i64* %b, align 8
- %vecinit.i = insertelement <1 x i64> undef, i64 %1, i32 0
- ret <1 x i64> %vecinit.i
-}
-
-define <1 x double> @testDUP.v1f64(double* %a, double* %b) #0 {
-; As there is a store operation depending on %1, the LD1R pattern cannot be selected,
-; so an LDR and STR of the d register should be emitted instead.
-; CHECK-LABEL: testDUP.v1f64
-; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}]
-; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}]
- %1 = load double* %a, align 8
- store double %1, double* %b, align 8
- %vecinit.i = insertelement <1 x double> undef, double %1, i32 0
- ret <1 x double> %vecinit.i
-}
-
-define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) {
-; CHECK-LABEL: test_vld2q_dup_s8
-; CHECK: ld2r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
-entry:
- %vld_dup = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
- %0 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 0
- %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
- %1 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 1
- %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
- ret %struct.int8x16x2_t %.fca.0.1.insert
-}
-
-define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) {
-; CHECK-LABEL: test_vld2q_dup_s16
-; CHECK: ld2r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
-entry:
- %0 = bitcast i16* %a to i8*
- %vld_dup = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
- %1 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 0
- %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
- %2 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 1
- %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
- ret %struct.int16x8x2_t %.fca.0.1.insert
-}
-
-define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) {
-; CHECK-LABEL: test_vld2q_dup_s32
-; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
-entry:
- %0 = bitcast i32* %a to i8*
- %vld_dup = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
- %1 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 0
- %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
- %2 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 1
- %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
- ret %struct.int32x4x2_t %.fca.0.1.insert
-}
-
-define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) {
-; CHECK-LABEL: test_vld2q_dup_s64
-; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
-entry:
- %0 = bitcast i64* %a to i8*
- %vld_dup = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
- %1 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 0
- %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 1
- %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
- ret %struct.int64x2x2_t %.fca.0.1.insert
-}
-
-define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) {
-; CHECK-LABEL: test_vld2q_dup_f32
-; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
-entry:
- %0 = bitcast float* %a to i8*
- %vld_dup = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
- %1 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 0
- %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
- %2 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 1
- %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
- ret %struct.float32x4x2_t %.fca.0.1.insert
-}
-
-define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) {
-; CHECK-LABEL: test_vld2q_dup_f64
-; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
-entry:
- %0 = bitcast double* %a to i8*
- %vld_dup = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
- %1 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 0
- %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 1
- %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
- ret %struct.float64x2x2_t %.fca.0.1.insert
-}
-
-define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) {
-; CHECK-LABEL: test_vld2_dup_s8
-; CHECK: ld2r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
-entry:
- %vld_dup = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
- %0 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 0
- %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
- %1 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 1
- %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
- ret %struct.int8x8x2_t %.fca.0.1.insert
-}
-
-define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) {
-; CHECK-LABEL: test_vld2_dup_s16
-; CHECK: ld2r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
-entry:
- %0 = bitcast i16* %a to i8*
- %vld_dup = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
- %1 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 0
- %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
- %2 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 1
- %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
- ret %struct.int16x4x2_t %.fca.0.1.insert
-}
-
-define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) {
-; CHECK-LABEL: test_vld2_dup_s32
-; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
-entry:
- %0 = bitcast i32* %a to i8*
- %vld_dup = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
- %1 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 0
- %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 1
- %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
- ret %struct.int32x2x2_t %.fca.0.1.insert
-}
-
-define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) {
-; CHECK-LABEL: test_vld2_dup_s64
-; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
-entry:
- %0 = bitcast i64* %a to i8*
- %vld_dup = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %0, i32 8)
- %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 0
- %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 1
- %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
- ret %struct.int64x1x2_t %.fca.0.1.insert
-}
-
-define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) {
-; CHECK-LABEL: test_vld2_dup_f32
-; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
-entry:
- %0 = bitcast float* %a to i8*
- %vld_dup = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
- %1 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 0
- %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 1
- %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
- ret %struct.float32x2x2_t %.fca.0.1.insert
-}
-
-define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) {
-; CHECK-LABEL: test_vld2_dup_f64
-; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
-entry:
- %0 = bitcast double* %a to i8*
- %vld_dup = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %0, i32 8)
- %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 0
- %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 1
- %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
- ret %struct.float64x1x2_t %.fca.0.1.insert
-}
-
-define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) {
-; CHECK-LABEL: test_vld3q_dup_s8
-; CHECK: ld3r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
-entry:
- %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
- %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
- %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
- %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
- %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
- %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
- %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
- ret %struct.int8x16x3_t %.fca.0.2.insert
-}
-
-define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) {
-; CHECK-LABEL: test_vld3q_dup_s16
-; CHECK: ld3r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
-entry:
- %0 = bitcast i16* %a to i8*
- %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
- %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
- %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
- %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
- %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
- %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
- %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
- ret %struct.int16x8x3_t %.fca.0.2.insert
-}
-
-define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) {
-; CHECK-LABEL: test_vld3q_dup_s32
-; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
-entry:
- %0 = bitcast i32* %a to i8*
- %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
- %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
- %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
- %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
- %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
- %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
- %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
- ret %struct.int32x4x3_t %.fca.0.2.insert
-}
-
-define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) {
-; CHECK-LABEL: test_vld3q_dup_s64
-; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
-entry:
- %0 = bitcast i64* %a to i8*
- %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
- %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
- %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
- %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
- %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
- %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
- ret %struct.int64x2x3_t %.fca.0.2.insert
-}
-
-define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) {
-; CHECK-LABEL: test_vld3q_dup_f32
-; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
-entry:
- %0 = bitcast float* %a to i8*
- %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
- %1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
- %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
- %2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
- %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
- %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
- %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
- ret %struct.float32x4x3_t %.fca.0.2.insert
-}
-
-define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) {
-; CHECK-LABEL: test_vld3q_dup_f64
-; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
-entry:
- %0 = bitcast double* %a to i8*
- %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
- %1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
- %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
- %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
- %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
- %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
- ret %struct.float64x2x3_t %.fca.0.2.insert
-}
-
-define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) {
-; CHECK-LABEL: test_vld3_dup_s8
-; CHECK: ld3r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
-entry:
- %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
- %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
- %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
- %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
- %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
- %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
- %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
- ret %struct.int8x8x3_t %.fca.0.2.insert
-}
-
-define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) {
-; CHECK-LABEL: test_vld3_dup_s16
-; CHECK: ld3r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
-entry:
- %0 = bitcast i16* %a to i8*
- %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
- %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
- %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
- %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
- %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
- %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
- %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
- ret %struct.int16x4x3_t %.fca.0.2.insert
-}
-
-define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) {
-; CHECK-LABEL: test_vld3_dup_s32
-; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
-entry:
- %0 = bitcast i32* %a to i8*
- %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
- %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
- %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
- %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
- %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
- %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
- ret %struct.int32x2x3_t %.fca.0.2.insert
-}
-
-define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) {
-; CHECK-LABEL: test_vld3_dup_s64
-; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
-entry:
- %0 = bitcast i64* %a to i8*
- %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %0, i32 8)
- %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
- %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
- %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
- %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
- ret %struct.int64x1x3_t %.fca.0.2.insert
-}
-
-define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) {
-; CHECK-LABEL: test_vld3_dup_f32
-; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
-entry:
- %0 = bitcast float* %a to i8*
- %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
- %1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
- %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
- %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
- %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
- %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
- ret %struct.float32x2x3_t %.fca.0.2.insert
-}
-
-define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) {
-; CHECK-LABEL: test_vld3_dup_f64
-; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
-entry:
- %0 = bitcast double* %a to i8*
- %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %0, i32 8)
- %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
- %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
- %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
- %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
- ret %struct.float64x1x3_t %.fca.0.2.insert
-}
-
-define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) {
-; CHECK-LABEL: test_vld4q_dup_s8
-; CHECK: ld4r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
-entry:
- %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
- %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
- %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
- %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
- %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
- %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
- %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
- %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 3
- %lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %lane3, 0, 3
- ret %struct.int8x16x4_t %.fca.0.3.insert
-}
-
-define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) {
-; CHECK-LABEL: test_vld4q_dup_s16
-; CHECK: ld4r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
-entry:
- %0 = bitcast i16* %a to i8*
- %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
- %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
- %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
- %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
- %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
- %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
- %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
- %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 3
- %lane3 = shufflevector <8 x i16> %4, <8 x i16> undef, <8 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %lane3, 0, 3
- ret %struct.int16x8x4_t %.fca.0.3.insert
-}
-
-define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) {
-; CHECK-LABEL: test_vld4q_dup_s32
-; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
-entry:
- %0 = bitcast i32* %a to i8*
- %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
- %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
- %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
- %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
- %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
- %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
- %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
- %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 3
- %lane3 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %lane3, 0, 3
- ret %struct.int32x4x4_t %.fca.0.3.insert
-}
-
-define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) {
-; CHECK-LABEL: test_vld4q_dup_s64
-; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
-entry:
- %0 = bitcast i64* %a to i8*
- %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
- %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
- %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
- %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
- %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
- %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
- %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 3
- %lane3 = shufflevector <2 x i64> %4, <2 x i64> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %lane3, 0, 3
- ret %struct.int64x2x4_t %.fca.0.3.insert
-}
-
-define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) {
-; CHECK-LABEL: test_vld4q_dup_f32
-; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
-entry:
- %0 = bitcast float* %a to i8*
- %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
- %1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
- %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
- %2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
- %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
- %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
- %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
- %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 3
- %lane3 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %lane3, 0, 3
- ret %struct.float32x4x4_t %.fca.0.3.insert
-}
-
-define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) {
-; CHECK-LABEL: test_vld4q_dup_f64
-; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
-entry:
- %0 = bitcast double* %a to i8*
- %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
- %1 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
- %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
- %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
- %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
- %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
- %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 3
- %lane3 = shufflevector <2 x double> %4, <2 x double> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %lane3, 0, 3
- ret %struct.float64x2x4_t %.fca.0.3.insert
-}
-
-define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) {
-; CHECK-LABEL: test_vld4_dup_s8
-; CHECK: ld4r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
-entry:
- %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
- %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
- %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
- %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
- %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
- %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
- %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
- %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 3
- %lane3 = shufflevector <8 x i8> %3, <8 x i8> undef, <8 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %lane3, 0, 3
- ret %struct.int8x8x4_t %.fca.0.3.insert
-}
-
-define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) {
-; CHECK-LABEL: test_vld4_dup_s16
-; CHECK: ld4r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
-entry:
- %0 = bitcast i16* %a to i8*
- %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
- %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
- %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
- %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
- %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
- %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
- %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
- %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 3
- %lane3 = shufflevector <4 x i16> %4, <4 x i16> undef, <4 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %lane3, 0, 3
- ret %struct.int16x4x4_t %.fca.0.3.insert
-}
-
-define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) {
-; CHECK-LABEL: test_vld4_dup_s32
-; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
-entry:
- %0 = bitcast i32* %a to i8*
- %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
- %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
- %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
- %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
- %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
- %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
- %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 3
- %lane3 = shufflevector <2 x i32> %4, <2 x i32> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %lane3, 0, 3
- ret %struct.int32x2x4_t %.fca.0.3.insert
-}
-
-define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) {
-; CHECK-LABEL: test_vld4_dup_s64
-; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
-entry:
- %0 = bitcast i64* %a to i8*
- %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %0, i32 8)
- %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
- %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
- %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
- %vld_dup.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 3
- %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld_dup.fca.3.extract, 0, 3
- ret %struct.int64x1x4_t %.fca.0.3.insert
-}
-
-define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) {
-; CHECK-LABEL: test_vld4_dup_f32
-; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
-entry:
- %0 = bitcast float* %a to i8*
- %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
- %1 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
- %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
- %2 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
- %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
- %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
- %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
- %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 3
- %lane3 = shufflevector <2 x float> %4, <2 x float> undef, <2 x i32> zeroinitializer
- %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %lane, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %lane3, 0, 3
- ret %struct.float32x2x4_t %.fca.0.3.insert
-}
-
-define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) {
-; CHECK-LABEL: test_vld4_dup_f64
-; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
-entry:
- %0 = bitcast double* %a to i8*
- %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %0, i32 8)
- %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
- %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
- %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
- %vld_dup.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 3
- %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld_dup.fca.3.extract, 0, 3
- ret %struct.float64x1x4_t %.fca.0.3.insert
-}
-
-define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) {
-; CHECK-LABEL: test_vld1q_lane_s8
-; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %0 = load i8* %a, align 1
- %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15
- ret <16 x i8> %vld1_lane
-}
-
-define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) {
-; CHECK-LABEL: test_vld1q_lane_s16
-; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %0 = load i16* %a, align 2
- %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7
- ret <8 x i16> %vld1_lane
-}
-
-define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) {
-; CHECK-LABEL: test_vld1q_lane_s32
-; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %0 = load i32* %a, align 4
- %vld1_lane = insertelement <4 x i32> %b, i32 %0, i32 3
- ret <4 x i32> %vld1_lane
-}
-
-define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) {
-; CHECK-LABEL: test_vld1q_lane_s64
-; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %0 = load i64* %a, align 8
- %vld1_lane = insertelement <2 x i64> %b, i64 %0, i32 1
- ret <2 x i64> %vld1_lane
-}
-
-define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) {
-; CHECK-LABEL: test_vld1q_lane_f32
-; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %0 = load float* %a, align 4
- %vld1_lane = insertelement <4 x float> %b, float %0, i32 3
- ret <4 x float> %vld1_lane
-}
-
-define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) {
-; CHECK-LABEL: test_vld1q_lane_f64
-; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %0 = load double* %a, align 8
- %vld1_lane = insertelement <2 x double> %b, double %0, i32 1
- ret <2 x double> %vld1_lane
-}
-
-define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) {
-; CHECK-LABEL: test_vld1_lane_s8
-; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %0 = load i8* %a, align 1
- %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7
- ret <8 x i8> %vld1_lane
-}
-
-define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) {
-; CHECK-LABEL: test_vld1_lane_s16
-; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %0 = load i16* %a, align 2
- %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3
- ret <4 x i16> %vld1_lane
-}
-
-define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) {
-; CHECK-LABEL: test_vld1_lane_s32
-; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %0 = load i32* %a, align 4
- %vld1_lane = insertelement <2 x i32> %b, i32 %0, i32 1
- ret <2 x i32> %vld1_lane
-}
-
-define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) {
-; CHECK-LABEL: test_vld1_lane_s64
-; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
-entry:
- %0 = load i64* %a, align 8
- %vld1_lane = insertelement <1 x i64> undef, i64 %0, i32 0
- ret <1 x i64> %vld1_lane
-}
-
-define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) {
-; CHECK-LABEL: test_vld1_lane_f32
-; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %0 = load float* %a, align 4
- %vld1_lane = insertelement <2 x float> %b, float %0, i32 1
- ret <2 x float> %vld1_lane
-}
-
-define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) {
-; CHECK-LABEL: test_vld1_lane_f64
-; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
-entry:
- %0 = load double* %a, align 8
- %vld1_lane = insertelement <1 x double> undef, double %0, i32 0
- ret <1 x double> %vld1_lane
-}
-
-define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vld2q_lane_s16
-; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
- %0 = bitcast i16* %a to i8*
- %vld2_lane = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
- %vld2_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.int16x8x2_t %.fca.0.1.insert
-}
-
-define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vld2q_lane_s32
-; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
- %0 = bitcast i32* %a to i8*
- %vld2_lane = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
- %vld2_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.int32x4x2_t %.fca.0.1.insert
-}
-
-define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vld2q_lane_s64
-; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
- %0 = bitcast i64* %a to i8*
- %vld2_lane = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
- %vld2_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.int64x2x2_t %.fca.0.1.insert
-}
-
-define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
-; CHECK-LABEL: test_vld2q_lane_f32
-; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
- %0 = bitcast float* %a to i8*
- %vld2_lane = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
- %vld2_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.float32x4x2_t %.fca.0.1.insert
-}
-
-define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
-; CHECK-LABEL: test_vld2q_lane_f64
-; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
- %0 = bitcast double* %a to i8*
- %vld2_lane = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
- %vld2_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.float64x2x2_t %.fca.0.1.insert
-}
-
-define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vld2_lane_s8
-; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
- %vld2_lane = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
- %vld2_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.int8x8x2_t %.fca.0.1.insert
-}
-
-define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vld2_lane_s16
-; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
- %0 = bitcast i16* %a to i8*
- %vld2_lane = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
- %vld2_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.int16x4x2_t %.fca.0.1.insert
-}
-
-define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vld2_lane_s32
-; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
- %0 = bitcast i32* %a to i8*
- %vld2_lane = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
- %vld2_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.int32x2x2_t %.fca.0.1.insert
-}
-
-define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vld2_lane_s64
-; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
- %0 = bitcast i64* %a to i8*
- %vld2_lane = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
- %vld2_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.int64x1x2_t %.fca.0.1.insert
-}
-
-define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
-; CHECK-LABEL: test_vld2_lane_f32
-; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
- %0 = bitcast float* %a to i8*
- %vld2_lane = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
- %vld2_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.float32x2x2_t %.fca.0.1.insert
-}
-
-define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
-; CHECK-LABEL: test_vld2_lane_f64
-; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
- %0 = bitcast double* %a to i8*
- %vld2_lane = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
- %vld2_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.float64x1x2_t %.fca.0.1.insert
-}
-
-define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vld3q_lane_s16
-; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
- %0 = bitcast i16* %a to i8*
- %vld3_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2)
- %vld3_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.int16x8x3_t %.fca.0.2.insert
-}
-
-define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vld3q_lane_s32
-; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
- %0 = bitcast i32* %a to i8*
- %vld3_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4)
- %vld3_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.int32x4x3_t %.fca.0.2.insert
-}
-
-define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vld3q_lane_s64
-; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
- %0 = bitcast i64* %a to i8*
- %vld3_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8)
- %vld3_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.int64x2x3_t %.fca.0.2.insert
-}
-
-define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
-; CHECK-LABEL: test_vld3q_lane_f32
-; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
- %0 = bitcast float* %a to i8*
- %vld3_lane = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4)
- %vld3_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.float32x4x3_t %.fca.0.2.insert
-}
-
-define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
-; CHECK-LABEL: test_vld3q_lane_f64
-; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
- %0 = bitcast double* %a to i8*
- %vld3_lane = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8)
- %vld3_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.float64x2x3_t %.fca.0.2.insert
-}
-
-define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vld3_lane_s8
-; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
- %vld3_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1)
- %vld3_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.int8x8x3_t %.fca.0.2.insert
-}
-
-define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vld3_lane_s16
-; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
- %0 = bitcast i16* %a to i8*
- %vld3_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2)
- %vld3_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.int16x4x3_t %.fca.0.2.insert
-}
-
-define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vld3_lane_s32
-; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
- %0 = bitcast i32* %a to i8*
- %vld3_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4)
- %vld3_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.int32x2x3_t %.fca.0.2.insert
-}
-
-define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vld3_lane_s64
-; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
- %0 = bitcast i64* %a to i8*
- %vld3_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8)
- %vld3_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.int64x1x3_t %.fca.0.2.insert
-}
-
-define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
-; CHECK-LABEL: test_vld3_lane_f32
-; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
- %0 = bitcast float* %a to i8*
- %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4)
- %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.float32x2x3_t %.fca.0.2.insert
-}
-
-define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
-; CHECK-LABEL: test_vld3_lane_f64
-; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
- %0 = bitcast double* %a to i8*
- %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8)
- %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.float64x1x3_t %.fca.0.2.insert
-}
-
-define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vld4q_lane_s8
-; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 1)
- %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.int8x16x4_t %.fca.0.3.insert
-}
-
-define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vld4q_lane_s16
-; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
- %0 = bitcast i16* %a to i8*
- %vld3_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
- %vld3_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.int16x8x4_t %.fca.0.3.insert
-}
-
-define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vld4q_lane_s32
-; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
- %0 = bitcast i32* %a to i8*
- %vld3_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
- %vld3_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.int32x4x4_t %.fca.0.3.insert
-}
-
-define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vld4q_lane_s64
-; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
- %0 = bitcast i64* %a to i8*
- %vld3_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
- %vld3_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.int64x2x4_t %.fca.0.3.insert
-}
-
-define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
-; CHECK-LABEL: test_vld4q_lane_f32
-; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
- %0 = bitcast float* %a to i8*
- %vld3_lane = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
- %vld3_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.float32x4x4_t %.fca.0.3.insert
-}
-
-define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
-; CHECK-LABEL: test_vld4q_lane_f64
-; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
- %0 = bitcast double* %a to i8*
- %vld3_lane = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
- %vld3_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.float64x2x4_t %.fca.0.3.insert
-}
-
-define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vld4_lane_s8
-; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
- %vld3_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
- %vld3_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.int8x8x4_t %.fca.0.3.insert
-}
-
-define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vld4_lane_s16
-; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
- %0 = bitcast i16* %a to i8*
- %vld3_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
- %vld3_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.int16x4x4_t %.fca.0.3.insert
-}
-
-define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vld4_lane_s32
-; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
- %0 = bitcast i32* %a to i8*
- %vld3_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
- %vld3_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.int32x2x4_t %.fca.0.3.insert
-}
-
-define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vld4_lane_s64
-; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
- %0 = bitcast i64* %a to i8*
- %vld3_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
- %vld3_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.int64x1x4_t %.fca.0.3.insert
-}
-
-define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
-; CHECK-LABEL: test_vld4_lane_f32
-; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
- %0 = bitcast float* %a to i8*
- %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
- %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.float32x2x4_t %.fca.0.3.insert
-}
-
-define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
-; CHECK-LABEL: test_vld4_lane_f64
-; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
- %0 = bitcast double* %a to i8*
- %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
- %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2
- %vld3_lane.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 3
- %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld3_lane.fca.3.extract, 0, 3
- ret %struct.float64x1x4_t %.fca.0.3.insert
-}
-
-define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) {
-; CHECK-LABEL: test_vst1q_lane_s8
-; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <16 x i8> %b, i32 15
- store i8 %0, i8* %a, align 1
- ret void
-}
-
-define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) {
-; CHECK-LABEL: test_vst1q_lane_s16
-; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <8 x i16> %b, i32 7
- store i16 %0, i16* %a, align 2
- ret void
-}
-
-define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) {
-; CHECK-LABEL: test_vst1q_lane_s32
-; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <4 x i32> %b, i32 3
- store i32 %0, i32* %a, align 4
- ret void
-}
-
-define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) {
-; CHECK-LABEL: test_vst1q_lane_s64
-; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <2 x i64> %b, i32 1
- store i64 %0, i64* %a, align 8
- ret void
-}
-
-define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) {
-; CHECK-LABEL: test_vst1q_lane_f32
-; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <4 x float> %b, i32 3
- store float %0, float* %a, align 4
- ret void
-}
-
-define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) {
-; CHECK-LABEL: test_vst1q_lane_f64
-; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <2 x double> %b, i32 1
- store double %0, double* %a, align 8
- ret void
-}
-
-define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) {
-; CHECK-LABEL: test_vst1_lane_s8
-; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <8 x i8> %b, i32 7
- store i8 %0, i8* %a, align 1
- ret void
-}
-
-define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) {
-; CHECK-LABEL: test_vst1_lane_s16
-; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <4 x i16> %b, i32 3
- store i16 %0, i16* %a, align 2
- ret void
-}
-
-define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) {
-; CHECK-LABEL: test_vst1_lane_s32
-; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <2 x i32> %b, i32 1
- store i32 %0, i32* %a, align 4
- ret void
-}
-
-define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) {
-; CHECK-LABEL: test_vst1_lane_s64
-; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <1 x i64> %b, i32 0
- store i64 %0, i64* %a, align 8
- ret void
-}
-
-define void @test_vst1_lane_f32(float* %a, <2 x float> %b) {
-; CHECK-LABEL: test_vst1_lane_f32
-; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <2 x float> %b, i32 1
- store float %0, float* %a, align 4
- ret void
-}
-
-define void @test_vst1_lane_f64(double* %a, <1 x double> %b) {
-; CHECK-LABEL: test_vst1_lane_f64
-; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %0 = extractelement <1 x double> %b, i32 0
- store double %0, double* %a, align 8
- ret void
-}
-
-define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_lane_s8
-; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1
- tail call void @llvm.arm.neon.vst2lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 15, i32 1)
- ret void
-}
-
-define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_lane_s16
-; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
- %0 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
- ret void
-}
-
-define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_lane_s32
-; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
- %0 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
- ret void
-}
-
-define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_lane_s64
-; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
- %0 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
- ret void
-}
-
-define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_lane_f32
-; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
- %0 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
- ret void
-}
-
-define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst2q_lane_f64
-; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
- %0 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
- ret void
-}
-
-define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst2_lane_s8
-; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
- tail call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
- ret void
-}
-
-define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst2_lane_s16
-; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
- %0 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
- ret void
-}
-
-define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst2_lane_s32
-; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
- %0 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
- ret void
-}
-
-define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst2_lane_s64
-; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
- %0 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
- ret void
-}
-
-define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst2_lane_f32
-; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
- %0 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
- ret void
-}
-
-define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst2_lane_f64
-; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
- %0 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
- ret void
-}
-
-define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_lane_s8
-; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2
- tail call void @llvm.arm.neon.vst3lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 15, i32 1)
- ret void
-}
-
-define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_lane_s16
-; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
- %0 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2)
- ret void
-}
-
-define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_lane_s32
-; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
- %0 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4)
- ret void
-}
-
-define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_lane_s64
-; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
- %0 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8)
- ret void
-}
-
-define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_lane_f32
-; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
- %0 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4)
- ret void
-}
-
-define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst3q_lane_f64
-; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
- %0 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8)
- ret void
-}
-
-define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst3_lane_s8
-; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
- tail call void @llvm.arm.neon.vst3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1)
- ret void
-}
-
-define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst3_lane_s16
-; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
- %0 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2)
- ret void
-}
-
-define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst3_lane_s32
-; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
- %0 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4)
- ret void
-}
-
-define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst3_lane_s64
-; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
- %0 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8)
- ret void
-}
-
-define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst3_lane_f32
-; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
- %0 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4)
- ret void
-}
-
-define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst3_lane_f64
-; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
- %0 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8)
- ret void
-}
-
-define void @test_vst4q_lane_s8(i16* %a, [4 x <16 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_lane_s8
-; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %0 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v16i8(i8* %0, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 2)
- ret void
-}
-
-define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_lane_s16
-; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
- %0 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
- ret void
-}
-
-define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_lane_s32
-; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
- %0 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
- ret void
-}
-
-define void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_lane_s64
-; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
- %0 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
- ret void
-}
-
-define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_lane_f32
-; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
- %0 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
- ret void
-}
-
-define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst4q_lane_f64
-; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
- %0 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
- ret void
-}
-
-define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
-; CHECK-LABEL: test_vst4_lane_s8
-; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
- tail call void @llvm.arm.neon.vst4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
- ret void
-}
-
-define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
-; CHECK-LABEL: test_vst4_lane_s16
-; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
- %0 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
- ret void
-}
-
-define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
-; CHECK-LABEL: test_vst4_lane_s32
-; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
- %0 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
- ret void
-}
-
-define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
-; CHECK-LABEL: test_vst4_lane_s64
-; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
- %0 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
- ret void
-}
-
-define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
-; CHECK-LABEL: test_vst4_lane_f32
-; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
- %0 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
- ret void
-}
-
-define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
-; CHECK-LABEL: test_vst4_lane_f64
-; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
-entry:
- %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
- %0 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
- ret void
-}
-
-declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
-declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
-declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
-declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
-declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
-declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
-declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
-declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
-declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
-declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32)
-declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
-declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32)
-declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
-declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
-declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
-declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
-declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
-declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
-declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
-declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
-declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32)
-declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
-declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32)
-declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
-declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
-declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
-declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
-declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
-declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
-declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
-declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
-declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32)
-declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
-declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32)
-declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
-declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
-declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
-declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
-declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
-declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
-
-define %struct.int8x16x2_t @test_vld2q_lane_s8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) {
-; CHECK-LABEL: test_vld2q_lane_s8
-; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
-entry:
- %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0
- %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1
- %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1)
- %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.int8x16x2_t %.fca.0.1.insert
-}
-
-define %struct.uint8x16x2_t @test_vld2q_lane_u8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) {
-; CHECK-LABEL: test_vld2q_lane_u8
-; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
-entry:
- %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0
- %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1
- %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1)
- %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.uint8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.uint8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.uint8x16x2_t %.fca.0.1.insert
-}
-
-define %struct.poly8x16x2_t @test_vld2q_lane_p8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) {
-; CHECK-LABEL: test_vld2q_lane_p8
-; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
-entry:
- %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0
- %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1
- %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1)
- %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0
- %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1
- %.fca.0.0.insert = insertvalue %struct.poly8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.poly8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1
- ret %struct.poly8x16x2_t %.fca.0.1.insert
-}
-
-define %struct.int8x16x3_t @test_vld3q_lane_s8(i8* readonly %ptr, [3 x <16 x i8>] %src.coerce) {
-; CHECK-LABEL: test_vld3q_lane_s8
-; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
-entry:
- %src.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %src.coerce, 0
- %src.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %src.coerce, 1
- %src.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %src.coerce, 2
- %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, <16 x i8> %src.coerce.fca.2.extract, i32 15, i32 1)
- %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.int8x16x3_t %.fca.0.2.insert
-}
-
-define %struct.uint8x16x3_t @test_vld3q_lane_u8(i8* readonly %ptr, [3 x <16 x i8>] %src.coerce) {
-; CHECK-LABEL: test_vld3q_lane_u8
-; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[15], [x0]
-entry:
- %src.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %src.coerce, 0
- %src.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %src.coerce, 1
- %src.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %src.coerce, 2
- %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, <16 x i8> %src.coerce.fca.2.extract, i32 15, i32 1)
- %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0
- %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1
- %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2
- %.fca.0.0.insert = insertvalue %struct.uint8x16x3_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.uint8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.uint8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2
- ret %struct.uint8x16x3_t %.fca.0.2.insert
-}
-
diff --git a/test/CodeGen/AArch64/neon-simd-ldst.ll b/test/CodeGen/AArch64/neon-simd-ldst.ll
deleted file mode 100644
index afc0901..0000000
--- a/test/CodeGen/AArch64/neon-simd-ldst.ll
+++ /dev/null
@@ -1,164 +0,0 @@
-; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define void @test_ldstq_4v(i8* noalias %io, i32 %count) {
-; CHECK-LABEL: test_ldstq_4v
-; CHECK: ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
-; CHECK: st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
-entry:
- %tobool62 = icmp eq i32 %count, 0
- br i1 %tobool62, label %while.end, label %while.body
-
-while.body: ; preds = %entry, %while.body
- %count.addr.063 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
- %dec = add i32 %count.addr.063, -1
- %vld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %io, i32 1)
- %vld4.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 3
- tail call void @llvm.arm.neon.vst4.v16i8(i8* %io, <16 x i8> %vld4.fca.0.extract, <16 x i8> %vld4.fca.1.extract, <16 x i8> %vld4.fca.2.extract, <16 x i8> %vld4.fca.3.extract, i32 1)
- %tobool = icmp eq i32 %dec, 0
- br i1 %tobool, label %while.end, label %while.body
-
-while.end: ; preds = %while.body, %entry
- ret void
-}
-
-declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32)
-
-declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32)
-
-define void @test_ldstq_3v(i8* noalias %io, i32 %count) {
-; CHECK-LABEL: test_ldstq_3v
-; CHECK: ld3 {v0.16b, v1.16b, v2.16b}, [x0]
-; CHECK: st3 {v0.16b, v1.16b, v2.16b}, [x0]
-entry:
- %tobool47 = icmp eq i32 %count, 0
- br i1 %tobool47, label %while.end, label %while.body
-
-while.body: ; preds = %entry, %while.body
- %count.addr.048 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
- %dec = add i32 %count.addr.048, -1
- %vld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %io, i32 1)
- %vld3.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 2
- tail call void @llvm.arm.neon.vst3.v16i8(i8* %io, <16 x i8> %vld3.fca.0.extract, <16 x i8> %vld3.fca.1.extract, <16 x i8> %vld3.fca.2.extract, i32 1)
- %tobool = icmp eq i32 %dec, 0
- br i1 %tobool, label %while.end, label %while.body
-
-while.end: ; preds = %while.body, %entry
- ret void
-}
-
-declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32)
-
-declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32)
-
-define void @test_ldstq_2v(i8* noalias %io, i32 %count) {
-; CHECK-LABEL: test_ldstq_2v
-; CHECK: ld2 {v0.16b, v1.16b}, [x0]
-; CHECK: st2 {v0.16b, v1.16b}, [x0]
-entry:
- %tobool22 = icmp eq i32 %count, 0
- br i1 %tobool22, label %while.end, label %while.body
-
-while.body: ; preds = %entry, %while.body
- %count.addr.023 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
- %dec = add i32 %count.addr.023, -1
- %vld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %io, i32 1)
- %vld2.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 1
- tail call void @llvm.arm.neon.vst2.v16i8(i8* %io, <16 x i8> %vld2.fca.0.extract, <16 x i8> %vld2.fca.1.extract, i32 1)
- %tobool = icmp eq i32 %dec, 0
- br i1 %tobool, label %while.end, label %while.body
-
-while.end: ; preds = %while.body, %entry
- ret void
-}
-
-declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32)
-
-declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
-
-define void @test_ldst_4v(i8* noalias %io, i32 %count) {
-; CHECK-LABEL: test_ldst_4v
-; CHECK: ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
-; CHECK: st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
-entry:
- %tobool42 = icmp eq i32 %count, 0
- br i1 %tobool42, label %while.end, label %while.body
-
-while.body: ; preds = %entry, %while.body
- %count.addr.043 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
- %dec = add i32 %count.addr.043, -1
- %vld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %io, i32 1)
- %vld4.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 3
- tail call void @llvm.arm.neon.vst4.v8i8(i8* %io, <8 x i8> %vld4.fca.0.extract, <8 x i8> %vld4.fca.1.extract, <8 x i8> %vld4.fca.2.extract, <8 x i8> %vld4.fca.3.extract, i32 1)
- %tobool = icmp eq i32 %dec, 0
- br i1 %tobool, label %while.end, label %while.body
-
-while.end: ; preds = %while.body, %entry
- ret void
-}
-
-declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32)
-
-declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
-
-define void @test_ldst_3v(i8* noalias %io, i32 %count) {
-; CHECK-LABEL: test_ldst_3v
-; CHECK: ld3 {v0.8b, v1.8b, v2.8b}, [x0]
-; CHECK: st3 {v0.8b, v1.8b, v2.8b}, [x0]
-entry:
- %tobool32 = icmp eq i32 %count, 0
- br i1 %tobool32, label %while.end, label %while.body
-
-while.body: ; preds = %entry, %while.body
- %count.addr.033 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
- %dec = add i32 %count.addr.033, -1
- %vld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8* %io, i32 1)
- %vld3.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 2
- tail call void @llvm.arm.neon.vst3.v8i8(i8* %io, <8 x i8> %vld3.fca.0.extract, <8 x i8> %vld3.fca.1.extract, <8 x i8> %vld3.fca.2.extract, i32 1)
- %tobool = icmp eq i32 %dec, 0
- br i1 %tobool, label %while.end, label %while.body
-
-while.end: ; preds = %while.body, %entry
- ret void
-}
-
-declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8*, i32)
-
-declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
-
-define void @test_ldst_2v(i8* noalias %io, i32 %count) {
-; CHECK-LABEL: test_ldst_2v
-; CHECK: ld2 {v0.8b, v1.8b}, [x0]
-; CHECK: st2 {v0.8b, v1.8b}, [x0]
-entry:
- %tobool22 = icmp eq i32 %count, 0
- br i1 %tobool22, label %while.end, label %while.body
-
-while.body: ; preds = %entry, %while.body
- %count.addr.023 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
- %dec = add i32 %count.addr.023, -1
- %vld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8* %io, i32 1)
- %vld2.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 1
- tail call void @llvm.arm.neon.vst2.v8i8(i8* %io, <8 x i8> %vld2.fca.0.extract, <8 x i8> %vld2.fca.1.extract, i32 1)
- %tobool = icmp eq i32 %dec, 0
- br i1 %tobool, label %while.end, label %while.body
-
-while.end: ; preds = %while.body, %entry
- ret void
-}
-
-declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8*, i32)
-
-declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
-
diff --git a/test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll b/test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll
deleted file mode 100644
index 156fe1d..0000000
--- a/test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll
+++ /dev/null
@@ -1,354 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-;Check for a post-increment updating load.
-define <4 x i16> @test_vld1_fx_update(i16** %ptr) nounwind {
-; CHECK: test_vld1_fx_update
-; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}], #8
- %A = load i16** %ptr
- %tmp0 = bitcast i16* %A to i8*
- %tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %tmp0, i32 2)
- %tmp2 = getelementptr i16* %A, i32 4
- store i16* %tmp2, i16** %ptr
- ret <4 x i16> %tmp1
-}
-
-;Check for a post-increment updating load with register increment.
-define <2 x i32> @test_vld1_reg_update(i32** %ptr, i32 %inc) nounwind {
-; CHECK: test_vld1_reg_update
-; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %A = load i32** %ptr
- %tmp0 = bitcast i32* %A to i8*
- %tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %tmp0, i32 4)
- %tmp2 = getelementptr i32* %A, i32 %inc
- store i32* %tmp2, i32** %ptr
- ret <2 x i32> %tmp1
-}
-
-define <2 x float> @test_vld2_fx_update(float** %ptr) nounwind {
-; CHECK: test_vld2_fx_update
-; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], #16
- %A = load float** %ptr
- %tmp0 = bitcast float* %A to i8*
- %tmp1 = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8* %tmp0, i32 4)
- %tmp2 = extractvalue { <2 x float>, <2 x float> } %tmp1, 0
- %tmp3 = getelementptr float* %A, i32 4
- store float* %tmp3, float** %ptr
- ret <2 x float> %tmp2
-}
-
-define <16 x i8> @test_vld2_reg_update(i8** %ptr, i32 %inc) nounwind {
-; CHECK: test_vld2_reg_update
-; CHECK: ld2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %A = load i8** %ptr
- %tmp0 = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %A, i32 1)
- %tmp1 = extractvalue { <16 x i8>, <16 x i8> } %tmp0, 0
- %tmp2 = getelementptr i8* %A, i32 %inc
- store i8* %tmp2, i8** %ptr
- ret <16 x i8> %tmp1
-}
-
-define <4 x i32> @test_vld3_fx_update(i32** %ptr) nounwind {
-; CHECK: test_vld3_fx_update
-; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], #48
- %A = load i32** %ptr
- %tmp0 = bitcast i32* %A to i8*
- %tmp1 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8* %tmp0, i32 4)
- %tmp2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %tmp1, 0
- %tmp3 = getelementptr i32* %A, i32 12
- store i32* %tmp3, i32** %ptr
- ret <4 x i32> %tmp2
-}
-
-define <4 x i16> @test_vld3_reg_update(i16** %ptr, i32 %inc) nounwind {
-; CHECK: test_vld3_reg_update
-; CHECK: ld3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %A = load i16** %ptr
- %tmp0 = bitcast i16* %A to i8*
- %tmp1 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %tmp0, i32 2)
- %tmp2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %tmp1, 0
- %tmp3 = getelementptr i16* %A, i32 %inc
- store i16* %tmp3, i16** %ptr
- ret <4 x i16> %tmp2
-}
-
-define <8 x i16> @test_vld4_fx_update(i16** %ptr) nounwind {
-; CHECK: test_vld4_fx_update
-; CHECK: ld4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}], #64
- %A = load i16** %ptr
- %tmp0 = bitcast i16* %A to i8*
- %tmp1 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8* %tmp0, i32 8)
- %tmp2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %tmp1, 0
- %tmp3 = getelementptr i16* %A, i32 32
- store i16* %tmp3, i16** %ptr
- ret <8 x i16> %tmp2
-}
-
-define <8 x i8> @test_vld4_reg_update(i8** %ptr, i32 %inc) nounwind {
-; CHECK: test_vld4_reg_update
-; CHECK: ld4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %A = load i8** %ptr
- %tmp0 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %A, i32 1)
- %tmp1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %tmp0, 0
- %tmp2 = getelementptr i8* %A, i32 %inc
- store i8* %tmp2, i8** %ptr
- ret <8 x i8> %tmp1
-}
-
-define void @test_vst1_fx_update(float** %ptr, <2 x float> %B) nounwind {
-; CHECK: test_vst1_fx_update
-; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}], #8
- %A = load float** %ptr
- %tmp0 = bitcast float* %A to i8*
- call void @llvm.arm.neon.vst1.v2f32(i8* %tmp0, <2 x float> %B, i32 4)
- %tmp2 = getelementptr float* %A, i32 2
- store float* %tmp2, float** %ptr
- ret void
-}
-
-define void @test_vst1_reg_update(i16** %ptr, <8 x i16> %B, i32 %inc) nounwind {
-; CHECK: test_vst1_reg_update
-; CHECK: st1 {v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}], x{{[0-9]+}}
- %A = load i16** %ptr
- %tmp0 = bitcast i16* %A to i8*
- call void @llvm.arm.neon.vst1.v8i16(i8* %tmp0, <8 x i16> %B, i32 2)
- %tmp1 = getelementptr i16* %A, i32 %inc
- store i16* %tmp1, i16** %ptr
- ret void
-}
-
-define void @test_vst2_fx_update(i64** %ptr, <1 x i64> %B) nounwind {
-; CHECK: test_vst2_fx_update
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}], #16
- %A = load i64** %ptr
- %tmp0 = bitcast i64* %A to i8*
- call void @llvm.arm.neon.vst2.v1i64(i8* %tmp0, <1 x i64> %B, <1 x i64> %B, i32 8)
- %tmp1 = getelementptr i64* %A, i32 2
- store i64* %tmp1, i64** %ptr
- ret void
-}
-
-define void @test_vst2_reg_update(i8** %ptr, <8 x i8> %B, i32 %inc) nounwind {
-; CHECK: test_vst2_reg_update
-; CHECK: st2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}], x{{[0-9]+}}
- %A = load i8** %ptr
- call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %B, <8 x i8> %B, i32 4)
- %tmp0 = getelementptr i8* %A, i32 %inc
- store i8* %tmp0, i8** %ptr
- ret void
-}
-
-define void @test_vst3_fx_update(i32** %ptr, <2 x i32> %B) nounwind {
-; CHECK: test_vst3_fx_update
-; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}], #24
- %A = load i32** %ptr
- %tmp0 = bitcast i32* %A to i8*
- call void @llvm.arm.neon.vst3.v2i32(i8* %tmp0, <2 x i32> %B, <2 x i32> %B, <2 x i32> %B, i32 4)
- %tmp1 = getelementptr i32* %A, i32 6
- store i32* %tmp1, i32** %ptr
- ret void
-}
-
-define void @test_vst3_reg_update(i16** %ptr, <8 x i16> %B, i32 %inc) nounwind {
-; CHECK: test_vst3_reg_update
-; CHECK: st3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}], x{{[0-9]+}}
- %A = load i16** %ptr
- %tmp0 = bitcast i16* %A to i8*
- call void @llvm.arm.neon.vst3.v8i16(i8* %tmp0, <8 x i16> %B, <8 x i16> %B, <8 x i16> %B, i32 2)
- %tmp1 = getelementptr i16* %A, i32 %inc
- store i16* %tmp1, i16** %ptr
- ret void
-}
-
-define void @test_vst4_fx_update(float** %ptr, <4 x float> %B) nounwind {
-; CHECK: test_vst4_fx_update
-; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}], #64
- %A = load float** %ptr
- %tmp0 = bitcast float* %A to i8*
- call void @llvm.arm.neon.vst4.v4f32(i8* %tmp0, <4 x float> %B, <4 x float> %B, <4 x float> %B, <4 x float> %B, i32 4)
- %tmp1 = getelementptr float* %A, i32 16
- store float* %tmp1, float** %ptr
- ret void
-}
-
-define void @test_vst4_reg_update(i8** %ptr, <8 x i8> %B, i32 %inc) nounwind {
-; CHECK: test_vst4_reg_update
-; CHECK: st4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}], x{{[0-9]+}}
- %A = load i8** %ptr
- call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %B, <8 x i8> %B, <8 x i8> %B, <8 x i8> %B, i32 1)
- %tmp0 = getelementptr i8* %A, i32 %inc
- store i8* %tmp0, i8** %ptr
- ret void
-}
-
-
-declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32)
-declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32)
-declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32)
-declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8*, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32)
-declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8*, i32)
-declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8*, i32)
-declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32)
-
-declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32)
-declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32)
-declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>, i32)
-declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
-declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
-declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32)
-declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32)
-declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
-
-define <16 x i8> @test_vld1x2_fx_update(i8* %a, i8** %ptr) {
-; CHECK: test_vld1x2_fx_update
-; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], #32
- %1 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8* %a, i32 1)
- %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0
- %tmp1 = getelementptr i8* %a, i32 32
- store i8* %tmp1, i8** %ptr
- ret <16 x i8> %2
-}
-
-define <8 x i16> @test_vld1x2_reg_update(i16* %a, i16** %ptr, i32 %inc) {
-; CHECK: test_vld1x2_reg_update
-; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = bitcast i16* %a to i8*
- %2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8* %1, i32 2)
- %3 = extractvalue { <8 x i16>, <8 x i16> } %2, 0
- %tmp1 = getelementptr i16* %a, i32 %inc
- store i16* %tmp1, i16** %ptr
- ret <8 x i16> %3
-}
-
-define <2 x i64> @test_vld1x3_fx_update(i64* %a, i64** %ptr) {
-; CHECK: test_vld1x3_fx_update
-; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}], #48
- %1 = bitcast i64* %a to i8*
- %2 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8* %1, i32 8)
- %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 0
- %tmp1 = getelementptr i64* %a, i32 6
- store i64* %tmp1, i64** %ptr
- ret <2 x i64> %3
-}
-
-define <8 x i16> @test_vld1x3_reg_update(i16* %a, i16** %ptr, i32 %inc) {
-; CHECK: test_vld1x3_reg_update
-; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = bitcast i16* %a to i8*
- %2 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8* %1, i32 2)
- %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 0
- %tmp1 = getelementptr i16* %a, i32 %inc
- store i16* %tmp1, i16** %ptr
- ret <8 x i16> %3
-}
-
-define <4 x float> @test_vld1x4_fx_update(float* %a, float** %ptr) {
-; CHECK: test_vld1x4_fx_update
-; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], #64
- %1 = bitcast float* %a to i8*
- %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8* %1, i32 4)
- %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0
- %tmp1 = getelementptr float* %a, i32 16
- store float* %tmp1, float** %ptr
- ret <4 x float> %3
-}
-
-define <8 x i8> @test_vld1x4_reg_update(i8* readonly %a, i8** %ptr, i32 %inc) #0 {
-; CHECK: test_vld1x4_reg_update
-; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8* %a, i32 1)
- %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
- %tmp1 = getelementptr i8* %a, i32 %inc
- store i8* %tmp1, i8** %ptr
- ret <8 x i8> %2
-}
-
-define void @test_vst1x2_fx_update(i8* %a, [2 x <16 x i8>] %b.coerce, i8** %ptr) #2 {
-; CHECK: test_vst1x2_fx_update
-; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], #32
- %1 = extractvalue [2 x <16 x i8>] %b.coerce, 0
- %2 = extractvalue [2 x <16 x i8>] %b.coerce, 1
- tail call void @llvm.aarch64.neon.vst1x2.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, i32 1)
- %tmp1 = getelementptr i8* %a, i32 32
- store i8* %tmp1, i8** %ptr
- ret void
-}
-
-define void @test_vst1x2_reg_update(i16* %a, [2 x <8 x i16>] %b.coerce, i16** %ptr, i32 %inc) #2 {
-; CHECK: test_vst1x2_reg_update
-; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = extractvalue [2 x <8 x i16>] %b.coerce, 0
- %2 = extractvalue [2 x <8 x i16>] %b.coerce, 1
- %3 = bitcast i16* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x2.v8i16(i8* %3, <8 x i16> %1, <8 x i16> %2, i32 2)
- %tmp1 = getelementptr i16* %a, i32 %inc
- store i16* %tmp1, i16** %ptr
- ret void
-}
-
-define void @test_vst1x3_fx_update(i32* %a, [3 x <2 x i32>] %b.coerce, i32** %ptr) #2 {
-; CHECK: test_vst1x3_fx_update
-; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], #24
- %1 = extractvalue [3 x <2 x i32>] %b.coerce, 0
- %2 = extractvalue [3 x <2 x i32>] %b.coerce, 1
- %3 = extractvalue [3 x <2 x i32>] %b.coerce, 2
- %4 = bitcast i32* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v2i32(i8* %4, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, i32 4)
- %tmp1 = getelementptr i32* %a, i32 6
- store i32* %tmp1, i32** %ptr
- ret void
-}
-
-define void @test_vst1x3_reg_update(i64* %a, [3 x <1 x i64>] %b.coerce, i64** %ptr, i32 %inc) #2 {
-; CHECK: test_vst1x3_reg_update
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = extractvalue [3 x <1 x i64>] %b.coerce, 0
- %2 = extractvalue [3 x <1 x i64>] %b.coerce, 1
- %3 = extractvalue [3 x <1 x i64>] %b.coerce, 2
- %4 = bitcast i64* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x3.v1i64(i8* %4, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, i32 8)
- %tmp1 = getelementptr i64* %a, i32 %inc
- store i64* %tmp1, i64** %ptr
- ret void
-}
-
-define void @test_vst1x4_fx_update(float* %a, [4 x <4 x float>] %b.coerce, float** %ptr) #2 {
-; CHECK: test_vst1x4_fx_update
-; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], #64
- %1 = extractvalue [4 x <4 x float>] %b.coerce, 0
- %2 = extractvalue [4 x <4 x float>] %b.coerce, 1
- %3 = extractvalue [4 x <4 x float>] %b.coerce, 2
- %4 = extractvalue [4 x <4 x float>] %b.coerce, 3
- %5 = bitcast float* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v4f32(i8* %5, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, i32 4)
- %tmp1 = getelementptr float* %a, i32 16
- store float* %tmp1, float** %ptr
- ret void
-}
-
-define void @test_vst1x4_reg_update(double* %a, [4 x <2 x double>] %b.coerce, double** %ptr, i32 %inc) #2 {
-; CHECK: test_vst1x4_reg_update
-; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = extractvalue [4 x <2 x double>] %b.coerce, 0
- %2 = extractvalue [4 x <2 x double>] %b.coerce, 1
- %3 = extractvalue [4 x <2 x double>] %b.coerce, 2
- %4 = extractvalue [4 x <2 x double>] %b.coerce, 3
- %5 = bitcast double* %a to i8*
- tail call void @llvm.aarch64.neon.vst1x4.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 8)
- %tmp1 = getelementptr double* %a, i32 %inc
- store double* %tmp1, double** %ptr
- ret void
-}
-
-declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8*, i32)
-declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8*, i32)
-declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8*, i32)
-declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8*, i32)
-declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8*, i32)
-declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8*, i32)
-declare void @llvm.aarch64.neon.vst1x2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
-declare void @llvm.aarch64.neon.vst1x2.v8i16(i8*, <8 x i16>, <8 x i16>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
-declare void @llvm.aarch64.neon.vst1x3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32)
-declare void @llvm.aarch64.neon.vst1x4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) #3
-declare void @llvm.aarch64.neon.vst1x4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32) #3
diff --git a/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll b/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll
deleted file mode 100644
index 80a9347..0000000
--- a/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll
+++ /dev/null
@@ -1,319 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define { [2 x <16 x i8>] } @test_vld2q_dup_fx_update(i8* %a, i8** %ptr) {
-; CHECK-LABEL: test_vld2q_dup_fx_update
-; CHECK: ld2r {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], #2
- %1 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
- %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0
- %3 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
- %4 = extractvalue { <16 x i8>, <16 x i8> } %1, 1
- %5 = shufflevector <16 x i8> %4, <16 x i8> undef, <16 x i32> zeroinitializer
- %6 = insertvalue { [2 x <16 x i8>] } undef, <16 x i8> %3, 0, 0
- %7 = insertvalue { [2 x <16 x i8>] } %6, <16 x i8> %5, 0, 1
- %tmp1 = getelementptr i8* %a, i32 2
- store i8* %tmp1, i8** %ptr
- ret { [2 x <16 x i8>] } %7
-}
-
-define { [2 x <4 x i32>] } @test_vld2q_dup_reg_update(i32* %a, i32** %ptr, i32 %inc) {
-; CHECK-LABEL: test_vld2q_dup_reg_update
-; CHECK: ld2r {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = bitcast i32* %a to i8*
- %2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %1, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
- %3 = extractvalue { <4 x i32>, <4 x i32> } %2, 0
- %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
- %5 = extractvalue { <4 x i32>, <4 x i32> } %2, 1
- %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> zeroinitializer
- %7 = insertvalue { [2 x <4 x i32>] } undef, <4 x i32> %4, 0, 0
- %8 = insertvalue { [2 x <4 x i32>] } %7, <4 x i32> %6, 0, 1
- %tmp1 = getelementptr i32* %a, i32 %inc
- store i32* %tmp1, i32** %ptr
- ret { [2 x <4 x i32>] } %8
-}
-
-define { [3 x <4 x i16>] } @test_vld3_dup_fx_update(i16* %a, i16** %ptr) {
-; CHECK-LABEL: test_vld3_dup_fx_update
-; CHECK: ld3r {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}], #6
- %1 = bitcast i16* %a to i8*
- %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %1, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
- %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 0
- %4 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
- %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 1
- %6 = shufflevector <4 x i16> %5, <4 x i16> undef, <4 x i32> zeroinitializer
- %7 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 2
- %8 = shufflevector <4 x i16> %7, <4 x i16> undef, <4 x i32> zeroinitializer
- %9 = insertvalue { [3 x <4 x i16>] } undef, <4 x i16> %4, 0, 0
- %10 = insertvalue { [3 x <4 x i16>] } %9, <4 x i16> %6, 0, 1
- %11 = insertvalue { [3 x <4 x i16>] } %10, <4 x i16> %8, 0, 2
- %tmp1 = getelementptr i16* %a, i32 3
- store i16* %tmp1, i16** %ptr
- ret { [3 x <4 x i16>] } %11
-}
-
-define { [3 x <8 x i8>] } @test_vld3_dup_reg_update(i8* %a, i8** %ptr, i32 %inc) {
-; CHECK-LABEL: test_vld3_dup_reg_update
-; CHECK: ld3r {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
- %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
- %3 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
- %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 1
- %5 = shufflevector <8 x i8> %4, <8 x i8> undef, <8 x i32> zeroinitializer
- %6 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 2
- %7 = shufflevector <8 x i8> %6, <8 x i8> undef, <8 x i32> zeroinitializer
- %8 = insertvalue { [3 x <8 x i8>] } undef, <8 x i8> %3, 0, 0
- %9 = insertvalue { [3 x <8 x i8>] } %8, <8 x i8> %5, 0, 1
- %10 = insertvalue { [3 x <8 x i8>] } %9, <8 x i8> %7, 0, 2
- %tmp1 = getelementptr i8* %a, i32 %inc
- store i8* %tmp1, i8** %ptr
- ret { [3 x <8 x i8>] } %10
-}
-
-define { [4 x <2 x i32>] } @test_vld4_dup_fx_update(i32* %a, i32** %ptr) #0 {
-; CHECK-LABEL: test_vld4_dup_fx_update
-; CHECK: ld4r {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], #16
- %1 = bitcast i32* %a to i8*
- %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %1, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
- %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 0
- %4 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
- %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 1
- %6 = shufflevector <2 x i32> %5, <2 x i32> undef, <2 x i32> zeroinitializer
- %7 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 2
- %8 = shufflevector <2 x i32> %7, <2 x i32> undef, <2 x i32> zeroinitializer
- %9 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 3
- %10 = shufflevector <2 x i32> %9, <2 x i32> undef, <2 x i32> zeroinitializer
- %11 = insertvalue { [4 x <2 x i32>] } undef, <2 x i32> %4, 0, 0
- %12 = insertvalue { [4 x <2 x i32>] } %11, <2 x i32> %6, 0, 1
- %13 = insertvalue { [4 x <2 x i32>] } %12, <2 x i32> %8, 0, 2
- %14 = insertvalue { [4 x <2 x i32>] } %13, <2 x i32> %10, 0, 3
- %tmp1 = getelementptr i32* %a, i32 4
- store i32* %tmp1, i32** %ptr
- ret { [4 x <2 x i32>] } %14
-}
-
-define { [4 x <2 x double>] } @test_vld4_dup_reg_update(double* %a, double** %ptr, i32 %inc) {
-; CHECK-LABEL: test_vld4_dup_reg_update
-; CHECK: ld4r {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = bitcast double* %a to i8*
- %2 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %1, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
- %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 0
- %4 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
- %5 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 1
- %6 = shufflevector <2 x double> %5, <2 x double> undef, <2 x i32> zeroinitializer
- %7 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 2
- %8 = shufflevector <2 x double> %7, <2 x double> undef, <2 x i32> zeroinitializer
- %9 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 3
- %10 = shufflevector <2 x double> %9, <2 x double> undef, <2 x i32> zeroinitializer
- %11 = insertvalue { [4 x <2 x double>] } undef, <2 x double> %4, 0, 0
- %12 = insertvalue { [4 x <2 x double>] } %11, <2 x double> %6, 0, 1
- %13 = insertvalue { [4 x <2 x double>] } %12, <2 x double> %8, 0, 2
- %14 = insertvalue { [4 x <2 x double>] } %13, <2 x double> %10, 0, 3
- %tmp1 = getelementptr double* %a, i32 %inc
- store double* %tmp1, double** %ptr
- ret { [4 x <2 x double>] } %14
-}
-
-define { [2 x <8 x i8>] } @test_vld2_lane_fx_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr) {
-; CHECK-LABEL: test_vld2_lane_fx_update
-; CHECK: ld2 {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[7], [x{{[0-9]+|sp}}], #2
- %1 = extractvalue [2 x <8 x i8>] %b, 0
- %2 = extractvalue [2 x <8 x i8>] %b, 1
- %3 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1)
- %4 = extractvalue { <8 x i8>, <8 x i8> } %3, 0
- %5 = extractvalue { <8 x i8>, <8 x i8> } %3, 1
- %6 = insertvalue { [2 x <8 x i8>] } undef, <8 x i8> %4, 0, 0
- %7 = insertvalue { [2 x <8 x i8>] } %6, <8 x i8> %5, 0, 1
- %tmp1 = getelementptr i8* %a, i32 2
- store i8* %tmp1, i8** %ptr
- ret { [2 x <8 x i8>] } %7
-}
-
-define { [2 x <8 x i8>] } @test_vld2_lane_reg_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr, i32 %inc) {
-; CHECK-LABEL: test_vld2_lane_reg_update
-; CHECK: ld2 {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[6], [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = extractvalue [2 x <8 x i8>] %b, 0
- %2 = extractvalue [2 x <8 x i8>] %b, 1
- %3 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 6, i32 1)
- %4 = extractvalue { <8 x i8>, <8 x i8> } %3, 0
- %5 = extractvalue { <8 x i8>, <8 x i8> } %3, 1
- %6 = insertvalue { [2 x <8 x i8>] } undef, <8 x i8> %4, 0, 0
- %7 = insertvalue { [2 x <8 x i8>] } %6, <8 x i8> %5, 0, 1
- %tmp1 = getelementptr i8* %a, i32 %inc
- store i8* %tmp1, i8** %ptr
- ret { [2 x <8 x i8>] } %7
-}
-
-define { [3 x <2 x float>] } @test_vld3_lane_fx_update(float* %a, [3 x <2 x float>] %b, float** %ptr) {
-; CHECK-LABEL: test_vld3_lane_fx_update
-; CHECK: ld3 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], #12
- %1 = extractvalue [3 x <2 x float>] %b, 0
- %2 = extractvalue [3 x <2 x float>] %b, 1
- %3 = extractvalue [3 x <2 x float>] %b, 2
- %4 = bitcast float* %a to i8*
- %5 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %4, <2 x float> %1, <2 x float> %2, <2 x float> %3, i32 1, i32 4)
- %6 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 0
- %7 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 1
- %8 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 2
- %9 = insertvalue { [3 x <2 x float>] } undef, <2 x float> %6, 0, 0
- %10 = insertvalue { [3 x <2 x float>] } %9, <2 x float> %7, 0, 1
- %11 = insertvalue { [3 x <2 x float>] } %10, <2 x float> %8, 0, 2
- %tmp1 = getelementptr float* %a, i32 3
- store float* %tmp1, float** %ptr
- ret { [3 x <2 x float>] } %11
-}
-
-define { [3 x <4 x i16>] } @test_vld3_lane_reg_update(i16* %a, [3 x <4 x i16>] %b, i16** %ptr, i32 %inc) {
-; CHECK-LABEL: test_vld3_lane_reg_update
-; CHECK: ld3 {v{{[0-9]+}}.h, v{{[0-9]+}}.h, v{{[0-9]+}}.h}[3], [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = extractvalue [3 x <4 x i16>] %b, 0
- %2 = extractvalue [3 x <4 x i16>] %b, 1
- %3 = extractvalue [3 x <4 x i16>] %b, 2
- %4 = bitcast i16* %a to i8*
- %5 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2)
- %6 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 0
- %7 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 1
- %8 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 2
- %9 = insertvalue { [3 x <4 x i16>] } undef, <4 x i16> %6, 0, 0
- %10 = insertvalue { [3 x <4 x i16>] } %9, <4 x i16> %7, 0, 1
- %11 = insertvalue { [3 x <4 x i16>] } %10, <4 x i16> %8, 0, 2
- %tmp1 = getelementptr i16* %a, i32 %inc
- store i16* %tmp1, i16** %ptr
- ret { [3 x <4 x i16>] } %11
-}
-
-define { [4 x <2 x i32>] } @test_vld4_lane_fx_update(i32* readonly %a, [4 x <2 x i32>] %b, i32** %ptr) {
-; CHECK-LABEL: test_vld4_lane_fx_update
-; CHECK: ld4 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], #16
- %1 = extractvalue [4 x <2 x i32>] %b, 0
- %2 = extractvalue [4 x <2 x i32>] %b, 1
- %3 = extractvalue [4 x <2 x i32>] %b, 2
- %4 = extractvalue [4 x <2 x i32>] %b, 3
- %5 = bitcast i32* %a to i8*
- %6 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %5, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, <2 x i32> %4, i32 1, i32 4)
- %7 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 0
- %8 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 1
- %9 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 2
- %10 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 3
- %11 = insertvalue { [4 x <2 x i32>] } undef, <2 x i32> %7, 0, 0
- %12 = insertvalue { [4 x <2 x i32>] } %11, <2 x i32> %8, 0, 1
- %13 = insertvalue { [4 x <2 x i32>] } %12, <2 x i32> %9, 0, 2
- %14 = insertvalue { [4 x <2 x i32>] } %13, <2 x i32> %10, 0, 3
- %tmp1 = getelementptr i32* %a, i32 4
- store i32* %tmp1, i32** %ptr
- ret { [4 x <2 x i32>] } %14
-}
-
-define { [4 x <2 x double>] } @test_vld4_lane_reg_update(double* readonly %a, [4 x <2 x double>] %b, double** %ptr, i32 %inc) {
-; CHECK-LABEL: test_vld4_lane_reg_update
-; CHECK: ld4 {v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = extractvalue [4 x <2 x double>] %b, 0
- %2 = extractvalue [4 x <2 x double>] %b, 1
- %3 = extractvalue [4 x <2 x double>] %b, 2
- %4 = extractvalue [4 x <2 x double>] %b, 3
- %5 = bitcast double* %a to i8*
- %6 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 1, i32 8)
- %7 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 0
- %8 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 1
- %9 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 2
- %10 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 3
- %11 = insertvalue { [4 x <2 x double>] } undef, <2 x double> %7, 0, 0
- %12 = insertvalue { [4 x <2 x double>] } %11, <2 x double> %8, 0, 1
- %13 = insertvalue { [4 x <2 x double>] } %12, <2 x double> %9, 0, 2
- %14 = insertvalue { [4 x <2 x double>] } %13, <2 x double> %10, 0, 3
- %tmp1 = getelementptr double* %a, i32 %inc
- store double* %tmp1, double** %ptr
- ret { [4 x <2 x double>] } %14
-}
-
-define void @test_vst2_lane_fx_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr) {
-; CHECK-LABEL: test_vst2_lane_fx_update
-; CHECK: st2 {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[7], [x{{[0-9]+|sp}}], #2
- %1 = extractvalue [2 x <8 x i8>] %b, 0
- %2 = extractvalue [2 x <8 x i8>] %b, 1
- call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1)
- %tmp1 = getelementptr i8* %a, i32 2
- store i8* %tmp1, i8** %ptr
- ret void
-}
-
-define void @test_vst2_lane_reg_update(i32* %a, [2 x <2 x i32>] %b.coerce, i32** %ptr, i32 %inc) {
-; CHECK-LABEL: test_vst2_lane_reg_update
-; CHECK: st2 {v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = extractvalue [2 x <2 x i32>] %b.coerce, 0
- %2 = extractvalue [2 x <2 x i32>] %b.coerce, 1
- %3 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %3, <2 x i32> %1, <2 x i32> %2, i32 1, i32 4)
- %tmp1 = getelementptr i32* %a, i32 %inc
- store i32* %tmp1, i32** %ptr
- ret void
-}
-
-define void @test_vst3_lane_fx_update(float* %a, [3 x <4 x float>] %b, float** %ptr) {
-; CHECK-LABEL: test_vst3_lane_fx_update
-; CHECK: st3 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[3], [x{{[0-9]+|sp}}], #12
- %1 = extractvalue [3 x <4 x float>] %b, 0
- %2 = extractvalue [3 x <4 x float>] %b, 1
- %3 = extractvalue [3 x <4 x float>] %b, 2
- %4 = bitcast float* %a to i8*
- call void @llvm.arm.neon.vst3lane.v4f32(i8* %4, <4 x float> %1, <4 x float> %2, <4 x float> %3, i32 3, i32 4)
- %tmp1 = getelementptr float* %a, i32 3
- store float* %tmp1, float** %ptr
- ret void
-}
-
-; Function Attrs: nounwind
-define void @test_vst3_lane_reg_update(i16* %a, [3 x <4 x i16>] %b, i16** %ptr, i32 %inc) {
-; CHECK-LABEL: test_vst3_lane_reg_update
-; CHECK: st3 {v{{[0-9]+}}.h, v{{[0-9]+}}.h, v{{[0-9]+}}.h}[3], [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = extractvalue [3 x <4 x i16>] %b, 0
- %2 = extractvalue [3 x <4 x i16>] %b, 1
- %3 = extractvalue [3 x <4 x i16>] %b, 2
- %4 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2)
- %tmp1 = getelementptr i16* %a, i32 %inc
- store i16* %tmp1, i16** %ptr
- ret void
-}
-
-define void @test_vst4_lane_fx_update(double* %a, [4 x <2 x double>] %b.coerce, double** %ptr) {
-; CHECK-LABEL: test_vst4_lane_fx_update
-; CHECK: st4 {v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d}[1], [x{{[0-9]+|sp}}], #32
- %1 = extractvalue [4 x <2 x double>] %b.coerce, 0
- %2 = extractvalue [4 x <2 x double>] %b.coerce, 1
- %3 = extractvalue [4 x <2 x double>] %b.coerce, 2
- %4 = extractvalue [4 x <2 x double>] %b.coerce, 3
- %5 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 1, i32 8)
- %tmp1 = getelementptr double* %a, i32 4
- store double* %tmp1, double** %ptr
- ret void
-}
-
-
-define void @test_vst4_lane_reg_update(float* %a, [4 x <2 x float>] %b.coerce, float** %ptr, i32 %inc) {
-; CHECK-LABEL: test_vst4_lane_reg_update
-; CHECK: st4 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}}
- %1 = extractvalue [4 x <2 x float>] %b.coerce, 0
- %2 = extractvalue [4 x <2 x float>] %b.coerce, 1
- %3 = extractvalue [4 x <2 x float>] %b.coerce, 2
- %4 = extractvalue [4 x <2 x float>] %b.coerce, 3
- %5 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %5, <2 x float> %1, <2 x float> %2, <2 x float> %3, <2 x float> %4, i32 1, i32 4)
- %tmp1 = getelementptr float* %a, i32 %inc
- store float* %tmp1, float** %ptr
- ret void
-}
-
-declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
-declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
-declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
-declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
-declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
-declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
-declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
-declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
-declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
-declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
diff --git a/test/CodeGen/AArch64/neon-simd-shift.ll b/test/CodeGen/AArch64/neon-simd-shift.ll
deleted file mode 100644
index fd76265..0000000
--- a/test/CodeGen/AArch64/neon-simd-shift.ll
+++ /dev/null
@@ -1,1556 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) {
-; CHECK: test_vshr_n_s8
-; CHECK: sshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vshr_n = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- ret <8 x i8> %vshr_n
-}
-
-define <4 x i16> @test_vshr_n_s16(<4 x i16> %a) {
-; CHECK: test_vshr_n_s16
-; CHECK: sshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vshr_n = ashr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
- ret <4 x i16> %vshr_n
-}
-
-define <2 x i32> @test_vshr_n_s32(<2 x i32> %a) {
-; CHECK: test_vshr_n_s32
-; CHECK: sshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vshr_n = ashr <2 x i32> %a, <i32 3, i32 3>
- ret <2 x i32> %vshr_n
-}
-
-define <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) {
-; CHECK: test_vshrq_n_s8
-; CHECK: sshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vshr_n = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- ret <16 x i8> %vshr_n
-}
-
-define <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) {
-; CHECK: test_vshrq_n_s16
-; CHECK: sshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vshr_n = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
- ret <8 x i16> %vshr_n
-}
-
-define <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) {
-; CHECK: test_vshrq_n_s32
-; CHECK: sshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vshr_n = ashr <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
- ret <4 x i32> %vshr_n
-}
-
-define <2 x i64> @test_vshrq_n_s64(<2 x i64> %a) {
-; CHECK: test_vshrq_n_s64
-; CHECK: sshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vshr_n = ashr <2 x i64> %a, <i64 3, i64 3>
- ret <2 x i64> %vshr_n
-}
-
-define <8 x i8> @test_vshr_n_u8(<8 x i8> %a) {
-; CHECK: test_vshr_n_u8
-; CHECK: ushr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vshr_n = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- ret <8 x i8> %vshr_n
-}
-
-define <4 x i16> @test_vshr_n_u16(<4 x i16> %a) {
-; CHECK: test_vshr_n_u16
-; CHECK: ushr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vshr_n = lshr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
- ret <4 x i16> %vshr_n
-}
-
-define <2 x i32> @test_vshr_n_u32(<2 x i32> %a) {
-; CHECK: test_vshr_n_u32
-; CHECK: ushr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vshr_n = lshr <2 x i32> %a, <i32 3, i32 3>
- ret <2 x i32> %vshr_n
-}
-
-define <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) {
-; CHECK: test_vshrq_n_u8
-; CHECK: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vshr_n = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- ret <16 x i8> %vshr_n
-}
-
-define <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) {
-; CHECK: test_vshrq_n_u16
-; CHECK: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vshr_n = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
- ret <8 x i16> %vshr_n
-}
-
-define <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) {
-; CHECK: test_vshrq_n_u32
-; CHECK: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vshr_n = lshr <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
- ret <4 x i32> %vshr_n
-}
-
-define <2 x i64> @test_vshrq_n_u64(<2 x i64> %a) {
-; CHECK: test_vshrq_n_u64
-; CHECK: ushr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vshr_n = lshr <2 x i64> %a, <i64 3, i64 3>
- ret <2 x i64> %vshr_n
-}
-
-define <8 x i8> @test_vsra_n_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vsra_n_s8
-; CHECK: ssra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vsra_n = ashr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- %1 = add <8 x i8> %vsra_n, %a
- ret <8 x i8> %1
-}
-
-define <4 x i16> @test_vsra_n_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vsra_n_s16
-; CHECK: ssra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vsra_n = ashr <4 x i16> %b, <i16 3, i16 3, i16 3, i16 3>
- %1 = add <4 x i16> %vsra_n, %a
- ret <4 x i16> %1
-}
-
-define <2 x i32> @test_vsra_n_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vsra_n_s32
-; CHECK: ssra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vsra_n = ashr <2 x i32> %b, <i32 3, i32 3>
- %1 = add <2 x i32> %vsra_n, %a
- ret <2 x i32> %1
-}
-
-define <16 x i8> @test_vsraq_n_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vsraq_n_s8
-; CHECK: ssra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vsra_n = ashr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- %1 = add <16 x i8> %vsra_n, %a
- ret <16 x i8> %1
-}
-
-define <8 x i16> @test_vsraq_n_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsraq_n_s16
-; CHECK: ssra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vsra_n = ashr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
- %1 = add <8 x i16> %vsra_n, %a
- ret <8 x i16> %1
-}
-
-define <4 x i32> @test_vsraq_n_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vsraq_n_s32
-; CHECK: ssra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vsra_n = ashr <4 x i32> %b, <i32 3, i32 3, i32 3, i32 3>
- %1 = add <4 x i32> %vsra_n, %a
- ret <4 x i32> %1
-}
-
-define <2 x i64> @test_vsraq_n_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vsraq_n_s64
-; CHECK: ssra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vsra_n = ashr <2 x i64> %b, <i64 3, i64 3>
- %1 = add <2 x i64> %vsra_n, %a
- ret <2 x i64> %1
-}
-
-define <8 x i8> @test_vsra_n_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vsra_n_u8
-; CHECK: usra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vsra_n = lshr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- %1 = add <8 x i8> %vsra_n, %a
- ret <8 x i8> %1
-}
-
-define <4 x i16> @test_vsra_n_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vsra_n_u16
-; CHECK: usra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vsra_n = lshr <4 x i16> %b, <i16 3, i16 3, i16 3, i16 3>
- %1 = add <4 x i16> %vsra_n, %a
- ret <4 x i16> %1
-}
-
-define <2 x i32> @test_vsra_n_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vsra_n_u32
-; CHECK: usra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vsra_n = lshr <2 x i32> %b, <i32 3, i32 3>
- %1 = add <2 x i32> %vsra_n, %a
- ret <2 x i32> %1
-}
-
-define <16 x i8> @test_vsraq_n_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vsraq_n_u8
-; CHECK: usra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vsra_n = lshr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- %1 = add <16 x i8> %vsra_n, %a
- ret <16 x i8> %1
-}
-
-define <8 x i16> @test_vsraq_n_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsraq_n_u16
-; CHECK: usra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vsra_n = lshr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
- %1 = add <8 x i16> %vsra_n, %a
- ret <8 x i16> %1
-}
-
-define <4 x i32> @test_vsraq_n_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vsraq_n_u32
-; CHECK: usra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vsra_n = lshr <4 x i32> %b, <i32 3, i32 3, i32 3, i32 3>
- %1 = add <4 x i32> %vsra_n, %a
- ret <4 x i32> %1
-}
-
-define <2 x i64> @test_vsraq_n_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vsraq_n_u64
-; CHECK: usra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vsra_n = lshr <2 x i64> %b, <i64 3, i64 3>
- %1 = add <2 x i64> %vsra_n, %a
- ret <2 x i64> %1
-}
-
-define <8 x i8> @test_vrshr_n_s8(<8 x i8> %a) {
-; CHECK: test_vrshr_n_s8
-; CHECK: srshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vrshr_n = tail call <8 x i8> @llvm.aarch64.neon.vsrshr.v8i8(<8 x i8> %a, i32 3)
- ret <8 x i8> %vrshr_n
-}
-
-
-define <4 x i16> @test_vrshr_n_s16(<4 x i16> %a) {
-; CHECK: test_vrshr_n_s16
-; CHECK: srshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vrshr_n = tail call <4 x i16> @llvm.aarch64.neon.vsrshr.v4i16(<4 x i16> %a, i32 3)
- ret <4 x i16> %vrshr_n
-}
-
-
-define <2 x i32> @test_vrshr_n_s32(<2 x i32> %a) {
-; CHECK: test_vrshr_n_s32
-; CHECK: srshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vrshr_n = tail call <2 x i32> @llvm.aarch64.neon.vsrshr.v2i32(<2 x i32> %a, i32 3)
- ret <2 x i32> %vrshr_n
-}
-
-
-define <16 x i8> @test_vrshrq_n_s8(<16 x i8> %a) {
-; CHECK: test_vrshrq_n_s8
-; CHECK: srshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vrshr_n = tail call <16 x i8> @llvm.aarch64.neon.vsrshr.v16i8(<16 x i8> %a, i32 3)
- ret <16 x i8> %vrshr_n
-}
-
-
-define <8 x i16> @test_vrshrq_n_s16(<8 x i16> %a) {
-; CHECK: test_vrshrq_n_s16
-; CHECK: srshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vrshr_n = tail call <8 x i16> @llvm.aarch64.neon.vsrshr.v8i16(<8 x i16> %a, i32 3)
- ret <8 x i16> %vrshr_n
-}
-
-
-define <4 x i32> @test_vrshrq_n_s32(<4 x i32> %a) {
-; CHECK: test_vrshrq_n_s32
-; CHECK: srshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vrshr_n = tail call <4 x i32> @llvm.aarch64.neon.vsrshr.v4i32(<4 x i32> %a, i32 3)
- ret <4 x i32> %vrshr_n
-}
-
-
-define <2 x i64> @test_vrshrq_n_s64(<2 x i64> %a) {
-; CHECK: test_vrshrq_n_s64
-; CHECK: srshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vrshr_n = tail call <2 x i64> @llvm.aarch64.neon.vsrshr.v2i64(<2 x i64> %a, i32 3)
- ret <2 x i64> %vrshr_n
-}
-
-
-define <8 x i8> @test_vrshr_n_u8(<8 x i8> %a) {
-; CHECK: test_vrshr_n_u8
-; CHECK: urshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vrshr_n = tail call <8 x i8> @llvm.aarch64.neon.vurshr.v8i8(<8 x i8> %a, i32 3)
- ret <8 x i8> %vrshr_n
-}
-
-
-define <4 x i16> @test_vrshr_n_u16(<4 x i16> %a) {
-; CHECK: test_vrshr_n_u16
-; CHECK: urshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vrshr_n = tail call <4 x i16> @llvm.aarch64.neon.vurshr.v4i16(<4 x i16> %a, i32 3)
- ret <4 x i16> %vrshr_n
-}
-
-
-define <2 x i32> @test_vrshr_n_u32(<2 x i32> %a) {
-; CHECK: test_vrshr_n_u32
-; CHECK: urshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vrshr_n = tail call <2 x i32> @llvm.aarch64.neon.vurshr.v2i32(<2 x i32> %a, i32 3)
- ret <2 x i32> %vrshr_n
-}
-
-
-define <16 x i8> @test_vrshrq_n_u8(<16 x i8> %a) {
-; CHECK: test_vrshrq_n_u8
-; CHECK: urshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vrshr_n = tail call <16 x i8> @llvm.aarch64.neon.vurshr.v16i8(<16 x i8> %a, i32 3)
- ret <16 x i8> %vrshr_n
-}
-
-
-define <8 x i16> @test_vrshrq_n_u16(<8 x i16> %a) {
-; CHECK: test_vrshrq_n_u16
-; CHECK: urshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vrshr_n = tail call <8 x i16> @llvm.aarch64.neon.vurshr.v8i16(<8 x i16> %a, i32 3)
- ret <8 x i16> %vrshr_n
-}
-
-
-define <4 x i32> @test_vrshrq_n_u32(<4 x i32> %a) {
-; CHECK: test_vrshrq_n_u32
-; CHECK: urshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vrshr_n = tail call <4 x i32> @llvm.aarch64.neon.vurshr.v4i32(<4 x i32> %a, i32 3)
- ret <4 x i32> %vrshr_n
-}
-
-
-define <2 x i64> @test_vrshrq_n_u64(<2 x i64> %a) {
-; CHECK: test_vrshrq_n_u64
-; CHECK: urshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vrshr_n = tail call <2 x i64> @llvm.aarch64.neon.vurshr.v2i64(<2 x i64> %a, i32 3)
- ret <2 x i64> %vrshr_n
-}
-
-
-define <8 x i8> @test_vrsra_n_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vrsra_n_s8
-; CHECK: srsra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %1 = tail call <8 x i8> @llvm.aarch64.neon.vsrshr.v8i8(<8 x i8> %b, i32 3)
- %vrsra_n = add <8 x i8> %1, %a
- ret <8 x i8> %vrsra_n
-}
-
-define <4 x i16> @test_vrsra_n_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vrsra_n_s16
-; CHECK: srsra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %1 = tail call <4 x i16> @llvm.aarch64.neon.vsrshr.v4i16(<4 x i16> %b, i32 3)
- %vrsra_n = add <4 x i16> %1, %a
- ret <4 x i16> %vrsra_n
-}
-
-define <2 x i32> @test_vrsra_n_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vrsra_n_s32
-; CHECK: srsra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %1 = tail call <2 x i32> @llvm.aarch64.neon.vsrshr.v2i32(<2 x i32> %b, i32 3)
- %vrsra_n = add <2 x i32> %1, %a
- ret <2 x i32> %vrsra_n
-}
-
-define <16 x i8> @test_vrsraq_n_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vrsraq_n_s8
-; CHECK: srsra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %1 = tail call <16 x i8> @llvm.aarch64.neon.vsrshr.v16i8(<16 x i8> %b, i32 3)
- %vrsra_n = add <16 x i8> %1, %a
- ret <16 x i8> %vrsra_n
-}
-
-define <8 x i16> @test_vrsraq_n_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vrsraq_n_s16
-; CHECK: srsra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %1 = tail call <8 x i16> @llvm.aarch64.neon.vsrshr.v8i16(<8 x i16> %b, i32 3)
- %vrsra_n = add <8 x i16> %1, %a
- ret <8 x i16> %vrsra_n
-}
-
-define <4 x i32> @test_vrsraq_n_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vrsraq_n_s32
-; CHECK: srsra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %1 = tail call <4 x i32> @llvm.aarch64.neon.vsrshr.v4i32(<4 x i32> %b, i32 3)
- %vrsra_n = add <4 x i32> %1, %a
- ret <4 x i32> %vrsra_n
-}
-
-define <2 x i64> @test_vrsraq_n_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vrsraq_n_s64
-; CHECK: srsra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %1 = tail call <2 x i64> @llvm.aarch64.neon.vsrshr.v2i64(<2 x i64> %b, i32 3)
- %vrsra_n = add <2 x i64> %1, %a
- ret <2 x i64> %vrsra_n
-}
-
-define <8 x i8> @test_vrsra_n_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vrsra_n_u8
-; CHECK: ursra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %1 = tail call <8 x i8> @llvm.aarch64.neon.vurshr.v8i8(<8 x i8> %b, i32 3)
- %vrsra_n = add <8 x i8> %1, %a
- ret <8 x i8> %vrsra_n
-}
-
-define <4 x i16> @test_vrsra_n_u16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vrsra_n_u16
-; CHECK: ursra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %1 = tail call <4 x i16> @llvm.aarch64.neon.vurshr.v4i16(<4 x i16> %b, i32 3)
- %vrsra_n = add <4 x i16> %1, %a
- ret <4 x i16> %vrsra_n
-}
-
-define <2 x i32> @test_vrsra_n_u32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vrsra_n_u32
-; CHECK: ursra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %1 = tail call <2 x i32> @llvm.aarch64.neon.vurshr.v2i32(<2 x i32> %b, i32 3)
- %vrsra_n = add <2 x i32> %1, %a
- ret <2 x i32> %vrsra_n
-}
-
-define <16 x i8> @test_vrsraq_n_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vrsraq_n_u8
-; CHECK: ursra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %1 = tail call <16 x i8> @llvm.aarch64.neon.vurshr.v16i8(<16 x i8> %b, i32 3)
- %vrsra_n = add <16 x i8> %1, %a
- ret <16 x i8> %vrsra_n
-}
-
-define <8 x i16> @test_vrsraq_n_u16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vrsraq_n_u16
-; CHECK: ursra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %1 = tail call <8 x i16> @llvm.aarch64.neon.vurshr.v8i16(<8 x i16> %b, i32 3)
- %vrsra_n = add <8 x i16> %1, %a
- ret <8 x i16> %vrsra_n
-}
-
-define <4 x i32> @test_vrsraq_n_u32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vrsraq_n_u32
-; CHECK: ursra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %1 = tail call <4 x i32> @llvm.aarch64.neon.vurshr.v4i32(<4 x i32> %b, i32 3)
- %vrsra_n = add <4 x i32> %1, %a
- ret <4 x i32> %vrsra_n
-}
-
-define <2 x i64> @test_vrsraq_n_u64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vrsraq_n_u64
-; CHECK: ursra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %1 = tail call <2 x i64> @llvm.aarch64.neon.vurshr.v2i64(<2 x i64> %b, i32 3)
- %vrsra_n = add <2 x i64> %1, %a
- ret <2 x i64> %vrsra_n
-}
-
-define <8 x i8> @test_vsri_n_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vsri_n_s8
-; CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vsri_n = tail call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
- ret <8 x i8> %vsri_n
-}
-
-
-define <4 x i16> @test_vsri_n_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vsri_n_s16
-; CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vsri = tail call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> %a, <4 x i16> %b, i32 3)
- ret <4 x i16> %vsri
-}
-
-
-define <2 x i32> @test_vsri_n_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vsri_n_s32
-; CHECK: sri {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vsri = tail call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> %a, <2 x i32> %b, i32 3)
- ret <2 x i32> %vsri
-}
-
-
-define <16 x i8> @test_vsriq_n_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vsriq_n_s8
-; CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vsri_n = tail call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
- ret <16 x i8> %vsri_n
-}
-
-
-define <8 x i16> @test_vsriq_n_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsriq_n_s16
-; CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vsri = tail call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> %a, <8 x i16> %b, i32 3)
- ret <8 x i16> %vsri
-}
-
-
-define <4 x i32> @test_vsriq_n_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vsriq_n_s32
-; CHECK: sri {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vsri = tail call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> %a, <4 x i32> %b, i32 3)
- ret <4 x i32> %vsri
-}
-
-
-define <2 x i64> @test_vsriq_n_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vsriq_n_s64
-; CHECK: sri {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vsri = tail call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> %a, <2 x i64> %b, i32 3)
- ret <2 x i64> %vsri
-}
-
-define <8 x i8> @test_vsri_n_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vsri_n_p8
-; CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vsri_n = tail call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
- ret <8 x i8> %vsri_n
-}
-
-define <4 x i16> @test_vsri_n_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vsri_n_p16
-; CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15
- %vsri = tail call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> %a, <4 x i16> %b, i32 15)
- ret <4 x i16> %vsri
-}
-
-define <16 x i8> @test_vsriq_n_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vsriq_n_p8
-; CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vsri_n = tail call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
- ret <16 x i8> %vsri_n
-}
-
-define <8 x i16> @test_vsriq_n_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsriq_n_p16
-; CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15
- %vsri = tail call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> %a, <8 x i16> %b, i32 15)
- ret <8 x i16> %vsri
-}
-
-define <8 x i8> @test_vsli_n_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vsli_n_s8
-; CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vsli_n = tail call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
- ret <8 x i8> %vsli_n
-}
-
-define <4 x i16> @test_vsli_n_s16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vsli_n_s16
-; CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vsli = tail call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> %a, <4 x i16> %b, i32 3)
- ret <4 x i16> %vsli
-}
-
-define <2 x i32> @test_vsli_n_s32(<2 x i32> %a, <2 x i32> %b) {
-; CHECK: test_vsli_n_s32
-; CHECK: sli {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vsli = tail call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> %a, <2 x i32> %b, i32 3)
- ret <2 x i32> %vsli
-}
-
-define <16 x i8> @test_vsliq_n_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vsliq_n_s8
-; CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vsli_n = tail call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
- ret <16 x i8> %vsli_n
-}
-
-define <8 x i16> @test_vsliq_n_s16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsliq_n_s16
-; CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vsli = tail call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> %a, <8 x i16> %b, i32 3)
- ret <8 x i16> %vsli
-}
-
-define <4 x i32> @test_vsliq_n_s32(<4 x i32> %a, <4 x i32> %b) {
-; CHECK: test_vsliq_n_s32
-; CHECK: sli {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vsli = tail call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> %a, <4 x i32> %b, i32 3)
- ret <4 x i32> %vsli
-}
-
-define <2 x i64> @test_vsliq_n_s64(<2 x i64> %a, <2 x i64> %b) {
-; CHECK: test_vsliq_n_s64
-; CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vsli = tail call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> %a, <2 x i64> %b, i32 3)
- ret <2 x i64> %vsli
-}
-
-define <8 x i8> @test_vsli_n_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vsli_n_p8
-; CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vsli_n = tail call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
- ret <8 x i8> %vsli_n
-}
-
-define <4 x i16> @test_vsli_n_p16(<4 x i16> %a, <4 x i16> %b) {
-; CHECK: test_vsli_n_p16
-; CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15
- %vsli = tail call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> %a, <4 x i16> %b, i32 15)
- ret <4 x i16> %vsli
-}
-
-define <16 x i8> @test_vsliq_n_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vsliq_n_p8
-; CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vsli_n = tail call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
- ret <16 x i8> %vsli_n
-}
-
-define <8 x i16> @test_vsliq_n_p16(<8 x i16> %a, <8 x i16> %b) {
-; CHECK: test_vsliq_n_p16
-; CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15
- %vsli = tail call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> %a, <8 x i16> %b, i32 15)
- ret <8 x i16> %vsli
-}
-
-define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) {
-; CHECK: test_vqshl_n_s8
-; CHECK: sqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vqshl = tail call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
- ret <8 x i8> %vqshl
-}
-
-
-define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) {
-; CHECK: test_vqshl_n_s16
-; CHECK: sqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vqshl = tail call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %a, <4 x i16> <i16 3, i16 3, i16 3, i16 3>)
- ret <4 x i16> %vqshl
-}
-
-
-define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) {
-; CHECK: test_vqshl_n_s32
-; CHECK: sqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vqshl = tail call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %a, <2 x i32> <i32 3, i32 3>)
- ret <2 x i32> %vqshl
-}
-
-
-define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) {
-; CHECK: test_vqshlq_n_s8
-; CHECK: sqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
- ret <16 x i8> %vqshl_n
-}
-
-
-define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) {
-; CHECK: test_vqshlq_n_s16
-; CHECK: sqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vqshl = tail call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %a, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
- ret <8 x i16> %vqshl
-}
-
-
-define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) {
-; CHECK: test_vqshlq_n_s32
-; CHECK: sqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vqshl = tail call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
- ret <4 x i32> %vqshl
-}
-
-
-define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) {
-; CHECK: test_vqshlq_n_s64
-; CHECK: sqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vqshl = tail call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %a, <2 x i64> <i64 3, i64 3>)
- ret <2 x i64> %vqshl
-}
-
-
-define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) {
-; CHECK: test_vqshl_n_u8
-; CHECK: uqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vqshl_n = tail call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
- ret <8 x i8> %vqshl_n
-}
-
-
-define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) {
-; CHECK: test_vqshl_n_u16
-; CHECK: uqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vqshl = tail call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %a, <4 x i16> <i16 3, i16 3, i16 3, i16 3>)
- ret <4 x i16> %vqshl
-}
-
-
-define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) {
-; CHECK: test_vqshl_n_u32
-; CHECK: uqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vqshl = tail call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %a, <2 x i32> <i32 3, i32 3>)
- ret <2 x i32> %vqshl
-}
-
-
-define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) {
-; CHECK: test_vqshlq_n_u8
-; CHECK: uqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
- ret <16 x i8> %vqshl_n
-}
-
-
-define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) {
-; CHECK: test_vqshlq_n_u16
-; CHECK: uqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vqshl = tail call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %a, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
- ret <8 x i16> %vqshl
-}
-
-
-define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) {
-; CHECK: test_vqshlq_n_u32
-; CHECK: uqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vqshl = tail call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
- ret <4 x i32> %vqshl
-}
-
-
-define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) {
-; CHECK: test_vqshlq_n_u64
-; CHECK: uqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vqshl = tail call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %a, <2 x i64> <i64 3, i64 3>)
- ret <2 x i64> %vqshl
-}
-
-define <8 x i8> @test_vqshlu_n_s8(<8 x i8> %a) {
-; CHECK: test_vqshlu_n_s8
-; CHECK: sqshlu {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
- %vqshlu = tail call <8 x i8> @llvm.aarch64.neon.vsqshlu.v8i8(<8 x i8> %a, i32 3)
- ret <8 x i8> %vqshlu
-}
-
-
-define <4 x i16> @test_vqshlu_n_s16(<4 x i16> %a) {
-; CHECK: test_vqshlu_n_s16
-; CHECK: sqshlu {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
- %vqshlu = tail call <4 x i16> @llvm.aarch64.neon.vsqshlu.v4i16(<4 x i16> %a, i32 3)
- ret <4 x i16> %vqshlu
-}
-
-
-define <2 x i32> @test_vqshlu_n_s32(<2 x i32> %a) {
-; CHECK: test_vqshlu_n_s32
-; CHECK: sqshlu {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
- %vqshlu = tail call <2 x i32> @llvm.aarch64.neon.vsqshlu.v2i32(<2 x i32> %a, i32 3)
- ret <2 x i32> %vqshlu
-}
-
-
-define <16 x i8> @test_vqshluq_n_s8(<16 x i8> %a) {
-; CHECK: test_vqshluq_n_s8
-; CHECK: sqshlu {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
- %vqshlu = tail call <16 x i8> @llvm.aarch64.neon.vsqshlu.v16i8(<16 x i8> %a, i32 3)
- ret <16 x i8> %vqshlu
-}
-
-
-define <8 x i16> @test_vqshluq_n_s16(<8 x i16> %a) {
-; CHECK: test_vqshluq_n_s16
-; CHECK: sqshlu {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
- %vqshlu = tail call <8 x i16> @llvm.aarch64.neon.vsqshlu.v8i16(<8 x i16> %a, i32 3)
- ret <8 x i16> %vqshlu
-}
-
-
-define <4 x i32> @test_vqshluq_n_s32(<4 x i32> %a) {
-; CHECK: test_vqshluq_n_s32
-; CHECK: sqshlu {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
- %vqshlu = tail call <4 x i32> @llvm.aarch64.neon.vsqshlu.v4i32(<4 x i32> %a, i32 3)
- ret <4 x i32> %vqshlu
-}
-
-
-define <2 x i64> @test_vqshluq_n_s64(<2 x i64> %a) {
-; CHECK: test_vqshluq_n_s64
-; CHECK: sqshlu {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
- %vqshlu = tail call <2 x i64> @llvm.aarch64.neon.vsqshlu.v2i64(<2 x i64> %a, i32 3)
- ret <2 x i64> %vqshlu
-}
-
-
-define <8 x i8> @test_vshrn_n_s16(<8 x i16> %a) {
-; CHECK: test_vshrn_n_s16
-; CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
- %1 = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
- %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
- ret <8 x i8> %vshrn_n
-}
-
-define <4 x i16> @test_vshrn_n_s32(<4 x i32> %a) {
-; CHECK: test_vshrn_n_s32
-; CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
- %1 = ashr <4 x i32> %a, <i32 9, i32 9, i32 9, i32 9>
- %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
- ret <4 x i16> %vshrn_n
-}
-
-define <2 x i32> @test_vshrn_n_s64(<2 x i64> %a) {
-; CHECK: test_vshrn_n_s64
-; CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
- %1 = ashr <2 x i64> %a, <i64 19, i64 19>
- %vshrn_n = trunc <2 x i64> %1 to <2 x i32>
- ret <2 x i32> %vshrn_n
-}
-
-define <8 x i8> @test_vshrn_n_u16(<8 x i16> %a) {
-; CHECK: test_vshrn_n_u16
-; CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
- %1 = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
- %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
- ret <8 x i8> %vshrn_n
-}
-
-define <4 x i16> @test_vshrn_n_u32(<4 x i32> %a) {
-; CHECK: test_vshrn_n_u32
-; CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
- %1 = lshr <4 x i32> %a, <i32 9, i32 9, i32 9, i32 9>
- %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
- ret <4 x i16> %vshrn_n
-}
-
-define <2 x i32> @test_vshrn_n_u64(<2 x i64> %a) {
-; CHECK: test_vshrn_n_u64
-; CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
- %1 = lshr <2 x i64> %a, <i64 19, i64 19>
- %vshrn_n = trunc <2 x i64> %1 to <2 x i32>
- ret <2 x i32> %vshrn_n
-}
-
-define <16 x i8> @test_vshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
-; CHECK: test_vshrn_high_n_s16
-; CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
- %1 = ashr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
- %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
- %2 = bitcast <8 x i8> %a to <1 x i64>
- %3 = bitcast <8 x i8> %vshrn_n to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
- %4 = bitcast <2 x i64> %shuffle.i to <16 x i8>
- ret <16 x i8> %4
-}
-
-define <8 x i16> @test_vshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
-; CHECK: test_vshrn_high_n_s32
-; CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
- %1 = ashr <4 x i32> %b, <i32 9, i32 9, i32 9, i32 9>
- %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
- %2 = bitcast <4 x i16> %a to <1 x i64>
- %3 = bitcast <4 x i16> %vshrn_n to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
- %4 = bitcast <2 x i64> %shuffle.i to <8 x i16>
- ret <8 x i16> %4
-}
-
-define <4 x i32> @test_vshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
-; CHECK: test_vshrn_high_n_s64
-; CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
- %1 = bitcast <2 x i32> %a to <1 x i64>
- %2 = ashr <2 x i64> %b, <i64 19, i64 19>
- %vshrn_n = trunc <2 x i64> %2 to <2 x i32>
- %3 = bitcast <2 x i32> %vshrn_n to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
- %4 = bitcast <2 x i64> %shuffle.i to <4 x i32>
- ret <4 x i32> %4
-}
-
-define <16 x i8> @test_vshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) {
-; CHECK: test_vshrn_high_n_u16
-; CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
- %1 = lshr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
- %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
- %2 = bitcast <8 x i8> %a to <1 x i64>
- %3 = bitcast <8 x i8> %vshrn_n to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
- %4 = bitcast <2 x i64> %shuffle.i to <16 x i8>
- ret <16 x i8> %4
-}
-
-define <8 x i16> @test_vshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) {
-; CHECK: test_vshrn_high_n_u32
-; CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
- %1 = lshr <4 x i32> %b, <i32 9, i32 9, i32 9, i32 9>
- %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
- %2 = bitcast <4 x i16> %a to <1 x i64>
- %3 = bitcast <4 x i16> %vshrn_n to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
- %4 = bitcast <2 x i64> %shuffle.i to <8 x i16>
- ret <8 x i16> %4
-}
-
-define <4 x i32> @test_vshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) {
-; CHECK: test_vshrn_high_n_u64
-; CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
- %1 = bitcast <2 x i32> %a to <1 x i64>
- %2 = lshr <2 x i64> %b, <i64 19, i64 19>
- %vshrn_n = trunc <2 x i64> %2 to <2 x i32>
- %3 = bitcast <2 x i32> %vshrn_n to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
- %4 = bitcast <2 x i64> %shuffle.i to <4 x i32>
- ret <4 x i32> %4
-}
-
-define <8 x i8> @test_vqshrun_n_s16(<8 x i16> %a) {
-; CHECK: test_vqshrun_n_s16
-; CHECK: sqshrun {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
- %vqshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqshrun.v8i8(<8 x i16> %a, i32 3)
- ret <8 x i8> %vqshrun
-}
-
-
-define <4 x i16> @test_vqshrun_n_s32(<4 x i32> %a) {
-; CHECK: test_vqshrun_n_s32
-; CHECK: sqshrun {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
- %vqshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqshrun.v4i16(<4 x i32> %a, i32 9)
- ret <4 x i16> %vqshrun
-}
-
-define <2 x i32> @test_vqshrun_n_s64(<2 x i64> %a) {
-; CHECK: test_vqshrun_n_s64
-; CHECK: sqshrun {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
- %vqshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqshrun.v2i32(<2 x i64> %a, i32 19)
- ret <2 x i32> %vqshrun
-}
-
-define <16 x i8> @test_vqshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
-; CHECK: test_vqshrun_high_n_s16
-; CHECK: sqshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
- %vqshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqshrun.v8i8(<8 x i16> %b, i32 3)
- %1 = bitcast <8 x i8> %a to <1 x i64>
- %2 = bitcast <8 x i8> %vqshrun to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
- ret <16 x i8> %3
-}
-
-define <8 x i16> @test_vqshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
-; CHECK: test_vqshrun_high_n_s32
-; CHECK: sqshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
- %vqshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqshrun.v4i16(<4 x i32> %b, i32 9)
- %1 = bitcast <4 x i16> %a to <1 x i64>
- %2 = bitcast <4 x i16> %vqshrun to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
- ret <8 x i16> %3
-}
-
-define <4 x i32> @test_vqshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
-; CHECK: test_vqshrun_high_n_s64
-; CHECK: sqshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
- %1 = bitcast <2 x i32> %a to <1 x i64>
- %vqshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqshrun.v2i32(<2 x i64> %b, i32 19)
- %2 = bitcast <2 x i32> %vqshrun to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
- ret <4 x i32> %3
-}
-
-define <8 x i8> @test_vrshrn_n_s16(<8 x i16> %a) {
-; CHECK: test_vrshrn_n_s16
-; CHECK: rshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
- %vrshrn = tail call <8 x i8> @llvm.aarch64.neon.vrshrn.v8i8(<8 x i16> %a, i32 3)
- ret <8 x i8> %vrshrn
-}
-
-
-define <4 x i16> @test_vrshrn_n_s32(<4 x i32> %a) {
-; CHECK: test_vrshrn_n_s32
-; CHECK: rshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
- %vrshrn = tail call <4 x i16> @llvm.aarch64.neon.vrshrn.v4i16(<4 x i32> %a, i32 9)
- ret <4 x i16> %vrshrn
-}
-
-
-define <2 x i32> @test_vrshrn_n_s64(<2 x i64> %a) {
-; CHECK: test_vrshrn_n_s64
-; CHECK: rshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
- %vrshrn = tail call <2 x i32> @llvm.aarch64.neon.vrshrn.v2i32(<2 x i64> %a, i32 19)
- ret <2 x i32> %vrshrn
-}
-
-define <16 x i8> @test_vrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
-; CHECK: test_vrshrn_high_n_s16
-; CHECK: rshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
- %vrshrn = tail call <8 x i8> @llvm.aarch64.neon.vrshrn.v8i8(<8 x i16> %b, i32 3)
- %1 = bitcast <8 x i8> %a to <1 x i64>
- %2 = bitcast <8 x i8> %vrshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
- ret <16 x i8> %3
-}
-
-define <8 x i16> @test_vrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
-; CHECK: test_vrshrn_high_n_s32
-; CHECK: rshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
- %vrshrn = tail call <4 x i16> @llvm.aarch64.neon.vrshrn.v4i16(<4 x i32> %b, i32 9)
- %1 = bitcast <4 x i16> %a to <1 x i64>
- %2 = bitcast <4 x i16> %vrshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
- ret <8 x i16> %3
-}
-
-define <4 x i32> @test_vrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
-; CHECK: test_vrshrn_high_n_s64
-; CHECK: rshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
- %1 = bitcast <2 x i32> %a to <1 x i64>
- %vrshrn = tail call <2 x i32> @llvm.aarch64.neon.vrshrn.v2i32(<2 x i64> %b, i32 19)
- %2 = bitcast <2 x i32> %vrshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
- ret <4 x i32> %3
-}
-
-define <8 x i8> @test_vqrshrun_n_s16(<8 x i16> %a) {
-; CHECK: test_vqrshrun_n_s16
-; CHECK: sqrshrun {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
- %vqrshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrun.v8i8(<8 x i16> %a, i32 3)
- ret <8 x i8> %vqrshrun
-}
-
-define <4 x i16> @test_vqrshrun_n_s32(<4 x i32> %a) {
-; CHECK: test_vqrshrun_n_s32
-; CHECK: sqrshrun {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
- %vqrshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrun.v4i16(<4 x i32> %a, i32 9)
- ret <4 x i16> %vqrshrun
-}
-
-define <2 x i32> @test_vqrshrun_n_s64(<2 x i64> %a) {
-; CHECK: test_vqrshrun_n_s64
-; CHECK: sqrshrun {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
- %vqrshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrun.v2i32(<2 x i64> %a, i32 19)
- ret <2 x i32> %vqrshrun
-}
-
-define <16 x i8> @test_vqrshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
-; CHECK: test_vqrshrun_high_n_s16
-; CHECK: sqrshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
- %vqrshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrun.v8i8(<8 x i16> %b, i32 3)
- %1 = bitcast <8 x i8> %a to <1 x i64>
- %2 = bitcast <8 x i8> %vqrshrun to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
- ret <16 x i8> %3
-}
-
-define <8 x i16> @test_vqrshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
-; CHECK: test_vqrshrun_high_n_s32
-; CHECK: sqrshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
- %vqrshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrun.v4i16(<4 x i32> %b, i32 9)
- %1 = bitcast <4 x i16> %a to <1 x i64>
- %2 = bitcast <4 x i16> %vqrshrun to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
- ret <8 x i16> %3
-}
-
-define <4 x i32> @test_vqrshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
-; CHECK: test_vqrshrun_high_n_s64
-; CHECK: sqrshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
- %1 = bitcast <2 x i32> %a to <1 x i64>
- %vqrshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrun.v2i32(<2 x i64> %b, i32 19)
- %2 = bitcast <2 x i32> %vqrshrun to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
- ret <4 x i32> %3
-}
-
-define <8 x i8> @test_vqshrn_n_s16(<8 x i16> %a) {
-; CHECK: test_vqshrn_n_s16
-; CHECK: sqshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
- %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqshrn.v8i8(<8 x i16> %a, i32 3)
- ret <8 x i8> %vqshrn
-}
-
-
-define <4 x i16> @test_vqshrn_n_s32(<4 x i32> %a) {
-; CHECK: test_vqshrn_n_s32
-; CHECK: sqshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
- %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqshrn.v4i16(<4 x i32> %a, i32 9)
- ret <4 x i16> %vqshrn
-}
-
-
-define <2 x i32> @test_vqshrn_n_s64(<2 x i64> %a) {
-; CHECK: test_vqshrn_n_s64
-; CHECK: sqshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
- %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqshrn.v2i32(<2 x i64> %a, i32 19)
- ret <2 x i32> %vqshrn
-}
-
-
-define <8 x i8> @test_vqshrn_n_u16(<8 x i16> %a) {
-; CHECK: test_vqshrn_n_u16
-; CHECK: uqshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
- %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqshrn.v8i8(<8 x i16> %a, i32 3)
- ret <8 x i8> %vqshrn
-}
-
-
-define <4 x i16> @test_vqshrn_n_u32(<4 x i32> %a) {
-; CHECK: test_vqshrn_n_u32
-; CHECK: uqshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
- %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqshrn.v4i16(<4 x i32> %a, i32 9)
- ret <4 x i16> %vqshrn
-}
-
-
-define <2 x i32> @test_vqshrn_n_u64(<2 x i64> %a) {
-; CHECK: test_vqshrn_n_u64
-; CHECK: uqshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
- %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqshrn.v2i32(<2 x i64> %a, i32 19)
- ret <2 x i32> %vqshrn
-}
-
-
-define <16 x i8> @test_vqshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
-; CHECK: test_vqshrn_high_n_s16
-; CHECK: sqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
- %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqshrn.v8i8(<8 x i16> %b, i32 3)
- %1 = bitcast <8 x i8> %a to <1 x i64>
- %2 = bitcast <8 x i8> %vqshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
- ret <16 x i8> %3
-}
-
-define <8 x i16> @test_vqshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
-; CHECK: test_vqshrn_high_n_s32
-; CHECK: sqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
- %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqshrn.v4i16(<4 x i32> %b, i32 9)
- %1 = bitcast <4 x i16> %a to <1 x i64>
- %2 = bitcast <4 x i16> %vqshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
- ret <8 x i16> %3
-}
-
-define <4 x i32> @test_vqshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
-; CHECK: test_vqshrn_high_n_s64
-; CHECK: sqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
- %1 = bitcast <2 x i32> %a to <1 x i64>
- %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqshrn.v2i32(<2 x i64> %b, i32 19)
- %2 = bitcast <2 x i32> %vqshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
- ret <4 x i32> %3
-}
-
-define <16 x i8> @test_vqshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) {
-; CHECK: test_vqshrn_high_n_u16
-; CHECK: uqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
- %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqshrn.v8i8(<8 x i16> %b, i32 3)
- %1 = bitcast <8 x i8> %a to <1 x i64>
- %2 = bitcast <8 x i8> %vqshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
- ret <16 x i8> %3
-}
-
-define <8 x i16> @test_vqshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) {
-; CHECK: test_vqshrn_high_n_u32
-; CHECK: uqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
- %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqshrn.v4i16(<4 x i32> %b, i32 9)
- %1 = bitcast <4 x i16> %a to <1 x i64>
- %2 = bitcast <4 x i16> %vqshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
- ret <8 x i16> %3
-}
-
-define <4 x i32> @test_vqshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) {
-; CHECK: test_vqshrn_high_n_u64
-; CHECK: uqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
- %1 = bitcast <2 x i32> %a to <1 x i64>
- %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqshrn.v2i32(<2 x i64> %b, i32 19)
- %2 = bitcast <2 x i32> %vqshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
- ret <4 x i32> %3
-}
-
-define <8 x i8> @test_vqrshrn_n_s16(<8 x i16> %a) {
-; CHECK: test_vqrshrn_n_s16
-; CHECK: sqrshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
- %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrn.v8i8(<8 x i16> %a, i32 3)
- ret <8 x i8> %vqrshrn
-}
-
-
-define <4 x i16> @test_vqrshrn_n_s32(<4 x i32> %a) {
-; CHECK: test_vqrshrn_n_s32
-; CHECK: sqrshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
- %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrn.v4i16(<4 x i32> %a, i32 9)
- ret <4 x i16> %vqrshrn
-}
-
-
-define <2 x i32> @test_vqrshrn_n_s64(<2 x i64> %a) {
-; CHECK: test_vqrshrn_n_s64
-; CHECK: sqrshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
- %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrn.v2i32(<2 x i64> %a, i32 19)
- ret <2 x i32> %vqrshrn
-}
-
-
-define <8 x i8> @test_vqrshrn_n_u16(<8 x i16> %a) {
-; CHECK: test_vqrshrn_n_u16
-; CHECK: uqrshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
- %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqrshrn.v8i8(<8 x i16> %a, i32 3)
- ret <8 x i8> %vqrshrn
-}
-
-
-define <4 x i16> @test_vqrshrn_n_u32(<4 x i32> %a) {
-; CHECK: test_vqrshrn_n_u32
-; CHECK: uqrshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
- %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqrshrn.v4i16(<4 x i32> %a, i32 9)
- ret <4 x i16> %vqrshrn
-}
-
-
-define <2 x i32> @test_vqrshrn_n_u64(<2 x i64> %a) {
-; CHECK: test_vqrshrn_n_u64
-; CHECK: uqrshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
- %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqrshrn.v2i32(<2 x i64> %a, i32 19)
- ret <2 x i32> %vqrshrn
-}
-
-
-define <16 x i8> @test_vqrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
-; CHECK: test_vqrshrn_high_n_s16
-; CHECK: sqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
- %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrn.v8i8(<8 x i16> %b, i32 3)
- %1 = bitcast <8 x i8> %a to <1 x i64>
- %2 = bitcast <8 x i8> %vqrshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
- ret <16 x i8> %3
-}
-
-define <8 x i16> @test_vqrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
-; CHECK: test_vqrshrn_high_n_s32
-; CHECK: sqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
- %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrn.v4i16(<4 x i32> %b, i32 9)
- %1 = bitcast <4 x i16> %a to <1 x i64>
- %2 = bitcast <4 x i16> %vqrshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
- ret <8 x i16> %3
-}
-
-define <4 x i32> @test_vqrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
-; CHECK: test_vqrshrn_high_n_s64
-; CHECK: sqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
- %1 = bitcast <2 x i32> %a to <1 x i64>
- %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrn.v2i32(<2 x i64> %b, i32 19)
- %2 = bitcast <2 x i32> %vqrshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
- ret <4 x i32> %3
-}
-
-define <16 x i8> @test_vqrshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) {
-; CHECK: test_vqrshrn_high_n_u16
-; CHECK: uqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
- %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqrshrn.v8i8(<8 x i16> %b, i32 3)
- %1 = bitcast <8 x i8> %a to <1 x i64>
- %2 = bitcast <8 x i8> %vqrshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
- ret <16 x i8> %3
-}
-
-define <8 x i16> @test_vqrshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) {
-; CHECK: test_vqrshrn_high_n_u32
-; CHECK: uqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
- %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqrshrn.v4i16(<4 x i32> %b, i32 9)
- %1 = bitcast <4 x i16> %a to <1 x i64>
- %2 = bitcast <4 x i16> %vqrshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
- ret <8 x i16> %3
-}
-
-define <4 x i32> @test_vqrshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) {
-; CHECK: test_vqrshrn_high_n_u64
-; CHECK: uqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
- %1 = bitcast <2 x i32> %a to <1 x i64>
- %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqrshrn.v2i32(<2 x i64> %b, i32 19)
- %2 = bitcast <2 x i32> %vqrshrn to <1 x i64>
- %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
- %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
- ret <4 x i32> %3
-}
-
-define <2 x float> @test_vcvt_n_f32_s32(<2 x i32> %a) {
-; CHECK: test_vcvt_n_f32_s32
-; CHECK: scvtf {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
- %vcvt = tail call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %a, i32 31)
- ret <2 x float> %vcvt
-}
-
-define <4 x float> @test_vcvtq_n_f32_s32(<4 x i32> %a) {
-; CHECK: test_vcvtq_n_f32_s32
-; CHECK: scvtf {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
- %vcvt = tail call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %a, i32 31)
- ret <4 x float> %vcvt
-}
-
-define <2 x double> @test_vcvtq_n_f64_s64(<2 x i64> %a) {
-; CHECK: test_vcvtq_n_f64_s64
-; CHECK: scvtf {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
- %vcvt = tail call <2 x double> @llvm.arm.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %a, i32 50)
- ret <2 x double> %vcvt
-}
-
-define <2 x float> @test_vcvt_n_f32_u32(<2 x i32> %a) {
-; CHECK: test_vcvt_n_f32_u32
-; CHECK: ucvtf {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
- %vcvt = tail call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %a, i32 31)
- ret <2 x float> %vcvt
-}
-
-define <4 x float> @test_vcvtq_n_f32_u32(<4 x i32> %a) {
-; CHECK: test_vcvtq_n_f32_u32
-; CHECK: ucvtf {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
- %vcvt = tail call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %a, i32 31)
- ret <4 x float> %vcvt
-}
-
-define <2 x double> @test_vcvtq_n_f64_u64(<2 x i64> %a) {
-; CHECK: test_vcvtq_n_f64_u64
-; CHECK: ucvtf {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
- %vcvt = tail call <2 x double> @llvm.arm.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %a, i32 50)
- ret <2 x double> %vcvt
-}
-
-define <2 x i32> @test_vcvt_n_s32_f32(<2 x float> %a) {
-; CHECK: test_vcvt_n_s32_f32
-; CHECK: fcvtzs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
- %vcvt = tail call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %a, i32 31)
- ret <2 x i32> %vcvt
-}
-
-define <4 x i32> @test_vcvtq_n_s32_f32(<4 x float> %a) {
-; CHECK: test_vcvtq_n_s32_f32
-; CHECK: fcvtzs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
- %vcvt = tail call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %a, i32 31)
- ret <4 x i32> %vcvt
-}
-
-define <2 x i64> @test_vcvtq_n_s64_f64(<2 x double> %a) {
-; CHECK: test_vcvtq_n_s64_f64
-; CHECK: fcvtzs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
- %vcvt = tail call <2 x i64> @llvm.arm.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> %a, i32 50)
- ret <2 x i64> %vcvt
-}
-
-define <2 x i32> @test_vcvt_n_u32_f32(<2 x float> %a) {
-; CHECK: test_vcvt_n_u32_f32
-; CHECK: fcvtzu {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
- %vcvt = tail call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %a, i32 31)
- ret <2 x i32> %vcvt
-}
-
-define <4 x i32> @test_vcvtq_n_u32_f32(<4 x float> %a) {
-; CHECK: test_vcvtq_n_u32_f32
-; CHECK: fcvtzu {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
- %vcvt = tail call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %a, i32 31)
- ret <4 x i32> %vcvt
-}
-
-define <2 x i64> @test_vcvtq_n_u64_f64(<2 x double> %a) {
-; CHECK: test_vcvtq_n_u64_f64
-; CHECK: fcvtzu {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
- %vcvt = tail call <2 x i64> @llvm.arm.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> %a, i32 50)
- ret <2 x i64> %vcvt
-}
-
-declare <8 x i8> @llvm.aarch64.neon.vsrshr.v8i8(<8 x i8>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vsrshr.v4i16(<4 x i16>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vsrshr.v2i32(<2 x i32>, i32)
-
-declare <16 x i8> @llvm.aarch64.neon.vsrshr.v16i8(<16 x i8>, i32)
-
-declare <8 x i16> @llvm.aarch64.neon.vsrshr.v8i16(<8 x i16>, i32)
-
-declare <4 x i32> @llvm.aarch64.neon.vsrshr.v4i32(<4 x i32>, i32)
-
-declare <2 x i64> @llvm.aarch64.neon.vsrshr.v2i64(<2 x i64>, i32)
-
-declare <8 x i8> @llvm.aarch64.neon.vurshr.v8i8(<8 x i8>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vurshr.v4i16(<4 x i16>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vurshr.v2i32(<2 x i32>, i32)
-
-declare <16 x i8> @llvm.aarch64.neon.vurshr.v16i8(<16 x i8>, i32)
-
-declare <8 x i16> @llvm.aarch64.neon.vurshr.v8i16(<8 x i16>, i32)
-
-declare <4 x i32> @llvm.aarch64.neon.vurshr.v4i32(<4 x i32>, i32)
-
-declare <2 x i64> @llvm.aarch64.neon.vurshr.v2i64(<2 x i64>, i32)
-
-declare <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8>, <8 x i8>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16>, <4 x i16>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32>, <2 x i32>, i32)
-
-declare <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8>, <16 x i8>, i32)
-
-declare <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16>, <8 x i16>, i32)
-
-declare <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32>, <4 x i32>, i32)
-
-declare <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64>, <2 x i64>, i32)
-
-declare <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8>, <8 x i8>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16>, <4 x i16>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32>, <2 x i32>, i32)
-
-declare <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8>, <16 x i8>, i32)
-
-declare <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16>, <8 x i16>, i32)
-
-declare <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32>, <4 x i32>, i32)
-
-declare <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32)
-
-declare <8 x i8> @llvm.aarch64.neon.vsqshlu.v8i8(<8 x i8>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vsqshlu.v4i16(<4 x i16>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vsqshlu.v2i32(<2 x i32>, i32)
-
-declare <16 x i8> @llvm.aarch64.neon.vsqshlu.v16i8(<16 x i8>, i32)
-
-declare <8 x i16> @llvm.aarch64.neon.vsqshlu.v8i16(<8 x i16>, i32)
-
-declare <4 x i32> @llvm.aarch64.neon.vsqshlu.v4i32(<4 x i32>, i32)
-
-declare <2 x i64> @llvm.aarch64.neon.vsqshlu.v2i64(<2 x i64>, i32)
-
-declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>)
-
-declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>)
-
-declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>)
-
-declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>)
-
-declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>)
-
-declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>)
-
-declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>)
-
-declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>)
-
-declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>)
-
-declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>)
-
-declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>)
-
-declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>)
-
-declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>)
-
-declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>)
-
-declare <8 x i8> @llvm.aarch64.neon.vsqshrun.v8i8(<8 x i16>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vsqshrun.v4i16(<4 x i32>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vsqshrun.v2i32(<2 x i64>, i32)
-
-declare <8 x i8> @llvm.aarch64.neon.vrshrn.v8i8(<8 x i16>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vrshrn.v4i16(<4 x i32>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vrshrn.v2i32(<2 x i64>, i32)
-
-declare <8 x i8> @llvm.aarch64.neon.vsqrshrun.v8i8(<8 x i16>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vsqrshrun.v4i16(<4 x i32>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vsqrshrun.v2i32(<2 x i64>, i32)
-
-declare <8 x i8> @llvm.aarch64.neon.vsqshrn.v8i8(<8 x i16>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vsqshrn.v4i16(<4 x i32>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vsqshrn.v2i32(<2 x i64>, i32)
-
-declare <8 x i8> @llvm.aarch64.neon.vuqshrn.v8i8(<8 x i16>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vuqshrn.v4i16(<4 x i32>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vuqshrn.v2i32(<2 x i64>, i32)
-
-declare <8 x i8> @llvm.aarch64.neon.vsqrshrn.v8i8(<8 x i16>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vsqrshrn.v4i16(<4 x i32>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vsqrshrn.v2i32(<2 x i64>, i32)
-
-declare <8 x i8> @llvm.aarch64.neon.vuqrshrn.v8i8(<8 x i16>, i32)
-
-declare <4 x i16> @llvm.aarch64.neon.vuqrshrn.v4i16(<4 x i32>, i32)
-
-declare <2 x i32> @llvm.aarch64.neon.vuqrshrn.v2i32(<2 x i64>, i32)
-
-declare <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32)
-
-declare <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32)
-
-declare <2 x double> @llvm.arm.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32)
-
-declare <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32)
-
-declare <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32)
-
-declare <2 x double> @llvm.arm.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32)
-
-declare <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32)
-
-declare <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32)
-
-declare <2 x i64> @llvm.arm.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32)
-
-declare <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32)
-
-declare <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32)
-
-declare <2 x i64> @llvm.arm.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32)
-
-define <1 x i64> @test_vcvt_n_s64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvt_n_s64_f64
-; CHECK: fcvtzs d{{[0-9]+}}, d{{[0-9]+}}, #64
- %1 = tail call <1 x i64> @llvm.arm.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> %a, i32 64)
- ret <1 x i64> %1
-}
-
-define <1 x i64> @test_vcvt_n_u64_f64(<1 x double> %a) {
-; CHECK-LABEL: test_vcvt_n_u64_f64
-; CHECK: fcvtzu d{{[0-9]+}}, d{{[0-9]+}}, #64
- %1 = tail call <1 x i64> @llvm.arm.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> %a, i32 64)
- ret <1 x i64> %1
-}
-
-define <1 x double> @test_vcvt_n_f64_s64(<1 x i64> %a) {
-; CHECK-LABEL: test_vcvt_n_f64_s64
-; CHECK: scvtf d{{[0-9]+}}, d{{[0-9]+}}, #64
- %1 = tail call <1 x double> @llvm.arm.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> %a, i32 64)
- ret <1 x double> %1
-}
-
-define <1 x double> @test_vcvt_n_f64_u64(<1 x i64> %a) {
-; CHECK-LABEL: test_vcvt_n_f64_u64
-; CHECK: ucvtf d{{[0-9]+}}, d{{[0-9]+}}, #64
- %1 = tail call <1 x double> @llvm.arm.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> %a, i32 64)
- ret <1 x double> %1
-}
-
-declare <1 x i64> @llvm.arm.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double>, i32)
-declare <1 x i64> @llvm.arm.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double>, i32)
-declare <1 x double> @llvm.arm.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64>, i32)
-declare <1 x double> @llvm.arm.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64>, i32)
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-simd-tbl.ll b/test/CodeGen/AArch64/neon-simd-tbl.ll
deleted file mode 100644
index 7a51c0f..0000000
--- a/test/CodeGen/AArch64/neon-simd-tbl.ll
+++ /dev/null
@@ -1,828 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-declare <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
-
-declare <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
-
-declare <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
-
-declare <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
-
-declare <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
-
-declare <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
-
-declare <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
-
-declare <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>)
-
-declare <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>)
-
-declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>)
-
-declare <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8>, <8 x i8>)
-
-declare <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
-
-declare <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
-
-declare <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
-
-declare <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8(<16 x i8>, <16 x i8>)
-
-declare <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
-
-declare <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
-
-define <8 x i8> @test_vtbl1_s8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtbl1_s8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
- ret <8 x i8> %vtbl11.i
-}
-
-define <8 x i8> @test_vqtbl1_s8(<16 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vqtbl1_s8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %a, <8 x i8> %b)
- ret <8 x i8> %vtbl1.i
-}
-
-define <8 x i8> @test_vtbl2_s8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vtbl2_s8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1
- %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
- ret <8 x i8> %vtbl17.i
-}
-
-define <8 x i8> @test_vqtbl2_s8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vqtbl2_s8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
- %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b)
- ret <8 x i8> %vtbl2.i
-}
-
-define <8 x i8> @test_vtbl3_s8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vtbl3_s8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2
- %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b)
- ret <8 x i8> %vtbl212.i
-}
-
-define <8 x i8> @test_vqtbl3_s8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vqtbl3_s8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
- %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b)
- ret <8 x i8> %vtbl3.i
-}
-
-define <8 x i8> @test_vtbl4_s8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vtbl4_s8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 2
- %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3
- %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b)
- ret <8 x i8> %vtbl216.i
-}
-
-define <8 x i8> @test_vqtbl4_s8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vqtbl4_s8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
- %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
- %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b)
- ret <8 x i8> %vtbl4.i
-}
-
-define <16 x i8> @test_vqtbl1q_s8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vqtbl1q_s8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8(<16 x i8> %a, <16 x i8> %b)
- ret <16 x i8> %vtbl1.i
-}
-
-define <16 x i8> @test_vqtbl2q_s8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) {
-; CHECK: test_vqtbl2q_s8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
- %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b)
- ret <16 x i8> %vtbl2.i
-}
-
-define <16 x i8> @test_vqtbl3q_s8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) {
-; CHECK: test_vqtbl3q_s8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
- %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b)
- ret <16 x i8> %vtbl3.i
-}
-
-define <16 x i8> @test_vqtbl4q_s8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) {
-; CHECK: test_vqtbl4q_s8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
- %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
- %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b)
- ret <16 x i8> %vtbl4.i
-}
-
-define <8 x i8> @test_vtbx1_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vtbx1_s8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %c)
- %0 = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
- %1 = sext <8 x i1> %0 to <8 x i8>
- %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i)
- ret <8 x i8> %vbsl.i
-}
-
-define <8 x i8> @test_vtbx2_s8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vtbx2_s8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1
- %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c)
- ret <8 x i8> %vtbx17.i
-}
-
-define <8 x i8> @test_vtbx3_s8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vtbx3_s8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2
- %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c)
- %0 = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
- %1 = sext <8 x i1> %0 to <8 x i8>
- %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i)
- ret <8 x i8> %vbsl.i
-}
-
-define <8 x i8> @test_vtbx4_s8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vtbx4_s8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 2
- %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3
- %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c)
- ret <8 x i8> %vtbx216.i
-}
-
-define <8 x i8> @test_vqtbx1_s8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vqtbx1_s8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c)
- ret <8 x i8> %vtbx1.i
-}
-
-define <8 x i8> @test_vqtbx2_s8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vqtbx2_s8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
- %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c)
- ret <8 x i8> %vtbx2.i
-}
-
-define <8 x i8> @test_vqtbx3_s8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vqtbx3_s8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
- %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c)
- ret <8 x i8> %vtbx3.i
-}
-
-define <8 x i8> @test_vqtbx4_s8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vqtbx4_s8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
- %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c)
- ret <8 x i8> %vtbx4.i
-}
-
-define <16 x i8> @test_vqtbx1q_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
-; CHECK: test_vqtbx1q_s8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
- ret <16 x i8> %vtbx1.i
-}
-
-define <16 x i8> @test_vqtbx2q_s8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) {
-; CHECK: test_vqtbx2q_s8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
- %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c)
- ret <16 x i8> %vtbx2.i
-}
-
-define <16 x i8> @test_vqtbx3q_s8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) {
-; CHECK: test_vqtbx3q_s8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
- %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c)
- ret <16 x i8> %vtbx3.i
-}
-
-define <16 x i8> @test_vqtbx4q_s8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) {
-; CHECK: test_vqtbx4q_s8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
- %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c)
- ret <16 x i8> %vtbx4.i
-}
-
-define <8 x i8> @test_vtbl1_u8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtbl1_u8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
- ret <8 x i8> %vtbl11.i
-}
-
-define <8 x i8> @test_vqtbl1_u8(<16 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vqtbl1_u8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %a, <8 x i8> %b)
- ret <8 x i8> %vtbl1.i
-}
-
-define <8 x i8> @test_vtbl2_u8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vtbl2_u8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1
- %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
- ret <8 x i8> %vtbl17.i
-}
-
-define <8 x i8> @test_vqtbl2_u8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vqtbl2_u8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
- %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b)
- ret <8 x i8> %vtbl2.i
-}
-
-define <8 x i8> @test_vtbl3_u8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vtbl3_u8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2
- %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b)
- ret <8 x i8> %vtbl212.i
-}
-
-define <8 x i8> @test_vqtbl3_u8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vqtbl3_u8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
- %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b)
- ret <8 x i8> %vtbl3.i
-}
-
-define <8 x i8> @test_vtbl4_u8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vtbl4_u8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 2
- %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3
- %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b)
- ret <8 x i8> %vtbl216.i
-}
-
-define <8 x i8> @test_vqtbl4_u8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vqtbl4_u8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
- %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
- %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b)
- ret <8 x i8> %vtbl4.i
-}
-
-define <16 x i8> @test_vqtbl1q_u8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vqtbl1q_u8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8(<16 x i8> %a, <16 x i8> %b)
- ret <16 x i8> %vtbl1.i
-}
-
-define <16 x i8> @test_vqtbl2q_u8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) {
-; CHECK: test_vqtbl2q_u8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
- %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b)
- ret <16 x i8> %vtbl2.i
-}
-
-define <16 x i8> @test_vqtbl3q_u8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) {
-; CHECK: test_vqtbl3q_u8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
- %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b)
- ret <16 x i8> %vtbl3.i
-}
-
-define <16 x i8> @test_vqtbl4q_u8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) {
-; CHECK: test_vqtbl4q_u8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
- %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
- %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b)
- ret <16 x i8> %vtbl4.i
-}
-
-define <8 x i8> @test_vtbx1_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vtbx1_u8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %c)
- %0 = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
- %1 = sext <8 x i1> %0 to <8 x i8>
- %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i)
- ret <8 x i8> %vbsl.i
-}
-
-define <8 x i8> @test_vtbx2_u8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vtbx2_u8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1
- %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c)
- ret <8 x i8> %vtbx17.i
-}
-
-define <8 x i8> @test_vtbx3_u8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vtbx3_u8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2
- %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c)
- %0 = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
- %1 = sext <8 x i1> %0 to <8 x i8>
- %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i)
- ret <8 x i8> %vbsl.i
-}
-
-define <8 x i8> @test_vtbx4_u8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vtbx4_u8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 2
- %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3
- %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c)
- ret <8 x i8> %vtbx216.i
-}
-
-define <8 x i8> @test_vqtbx1_u8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vqtbx1_u8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c)
- ret <8 x i8> %vtbx1.i
-}
-
-define <8 x i8> @test_vqtbx2_u8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vqtbx2_u8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
- %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c)
- ret <8 x i8> %vtbx2.i
-}
-
-define <8 x i8> @test_vqtbx3_u8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vqtbx3_u8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
- %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c)
- ret <8 x i8> %vtbx3.i
-}
-
-define <8 x i8> @test_vqtbx4_u8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vqtbx4_u8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
- %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c)
- ret <8 x i8> %vtbx4.i
-}
-
-define <16 x i8> @test_vqtbx1q_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
-; CHECK: test_vqtbx1q_u8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
- ret <16 x i8> %vtbx1.i
-}
-
-define <16 x i8> @test_vqtbx2q_u8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) {
-; CHECK: test_vqtbx2q_u8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
- %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c)
- ret <16 x i8> %vtbx2.i
-}
-
-define <16 x i8> @test_vqtbx3q_u8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) {
-; CHECK: test_vqtbx3q_u8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
- %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c)
- ret <16 x i8> %vtbx3.i
-}
-
-define <16 x i8> @test_vqtbx4q_u8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) {
-; CHECK: test_vqtbx4q_u8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
- %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c)
- ret <16 x i8> %vtbx4.i
-}
-
-define <8 x i8> @test_vtbl1_p8(<8 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vtbl1_p8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
- ret <8 x i8> %vtbl11.i
-}
-
-define <8 x i8> @test_vqtbl1_p8(<16 x i8> %a, <8 x i8> %b) {
-; CHECK: test_vqtbl1_p8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %a, <8 x i8> %b)
- ret <8 x i8> %vtbl1.i
-}
-
-define <8 x i8> @test_vtbl2_p8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vtbl2_p8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1
- %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
- ret <8 x i8> %vtbl17.i
-}
-
-define <8 x i8> @test_vqtbl2_p8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vqtbl2_p8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
- %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b)
- ret <8 x i8> %vtbl2.i
-}
-
-define <8 x i8> @test_vtbl3_p8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vtbl3_p8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2
- %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b)
- ret <8 x i8> %vtbl212.i
-}
-
-define <8 x i8> @test_vqtbl3_p8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vqtbl3_p8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
- %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b)
- ret <8 x i8> %vtbl3.i
-}
-
-define <8 x i8> @test_vtbl4_p8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vtbl4_p8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 2
- %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3
- %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b)
- ret <8 x i8> %vtbl216.i
-}
-
-define <8 x i8> @test_vqtbl4_p8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) {
-; CHECK: test_vqtbl4_p8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
- %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
- %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b)
- ret <8 x i8> %vtbl4.i
-}
-
-define <16 x i8> @test_vqtbl1q_p8(<16 x i8> %a, <16 x i8> %b) {
-; CHECK: test_vqtbl1q_p8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8(<16 x i8> %a, <16 x i8> %b)
- ret <16 x i8> %vtbl1.i
-}
-
-define <16 x i8> @test_vqtbl2q_p8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) {
-; CHECK: test_vqtbl2q_p8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
- %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b)
- ret <16 x i8> %vtbl2.i
-}
-
-define <16 x i8> @test_vqtbl3q_p8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) {
-; CHECK: test_vqtbl3q_p8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
- %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b)
- ret <16 x i8> %vtbl3.i
-}
-
-define <16 x i8> @test_vqtbl4q_p8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) {
-; CHECK: test_vqtbl4q_p8:
-; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
- %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
- %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
- %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
- %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b)
- ret <16 x i8> %vtbl4.i
-}
-
-define <8 x i8> @test_vtbx1_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vtbx1_p8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8(<16 x i8> %vtbl1.i, <8 x i8> %c)
- %0 = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
- %1 = sext <8 x i1> %0 to <8 x i8>
- %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i)
- ret <8 x i8> %vbsl.i
-}
-
-define <8 x i8> @test_vtbx2_p8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vtbx2_p8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1
- %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c)
- ret <8 x i8> %vtbx17.i
-}
-
-define <8 x i8> @test_vtbx3_p8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vtbx3_p8:
-; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2
- %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c)
- %0 = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
- %1 = sext <8 x i1> %0 to <8 x i8>
- %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i)
- ret <8 x i8> %vbsl.i
-}
-
-define <8 x i8> @test_vtbx4_p8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vtbx4_p8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 2
- %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3
- %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c)
- ret <8 x i8> %vtbx216.i
-}
-
-define <8 x i8> @test_vqtbx1_p8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) {
-; CHECK: test_vqtbx1_p8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c)
- ret <8 x i8> %vtbx1.i
-}
-
-define <8 x i8> @test_vqtbx2_p8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vqtbx2_p8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
- %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c)
- ret <8 x i8> %vtbx2.i
-}
-
-define <8 x i8> @test_vqtbx3_p8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vqtbx3_p8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
- %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c)
- ret <8 x i8> %vtbx3.i
-}
-
-define <8 x i8> @test_vqtbx4_p8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) {
-; CHECK: test_vqtbx4_p8:
-; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
- %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c)
- ret <8 x i8> %vtbx4.i
-}
-
-define <16 x i8> @test_vqtbx1q_p8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
-; CHECK: test_vqtbx1q_p8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
- ret <16 x i8> %vtbx1.i
-}
-
-define <16 x i8> @test_vqtbx2q_p8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) {
-; CHECK: test_vqtbx2q_p8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
- %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c)
- ret <16 x i8> %vtbx2.i
-}
-
-define <16 x i8> @test_vqtbx3q_p8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) {
-; CHECK: test_vqtbx3q_p8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
- %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c)
- ret <16 x i8> %vtbx3.i
-}
-
-define <16 x i8> @test_vqtbx4q_p8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) {
-; CHECK: test_vqtbx4q_p8:
-; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
-entry:
- %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
- %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
- %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
- %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
- %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c)
- ret <16 x i8> %vtbx4.i
-}
-
diff --git a/test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll b/test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll
deleted file mode 100644
index bb3300e..0000000
--- a/test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-; This file tests the spill of FPR8/FPR16. The volatile loads/stores force the
-; allocator to keep the value live until it's needed.
-
-%bigtype_v1i8 = type [20 x <1 x i8>]
-
-define void @spill_fpr8(%bigtype_v1i8* %addr) {
-; CHECK-LABEL: spill_fpr8:
-; CHECK: 1-byte Folded Spill
-; CHECK: 1-byte Folded Reload
- %val1 = load volatile %bigtype_v1i8* %addr
- %val2 = load volatile %bigtype_v1i8* %addr
- store volatile %bigtype_v1i8 %val1, %bigtype_v1i8* %addr
- store volatile %bigtype_v1i8 %val2, %bigtype_v1i8* %addr
- ret void
-}
-
-%bigtype_v1i16 = type [20 x <1 x i16>]
-
-define void @spill_fpr16(%bigtype_v1i16* %addr) {
-; CHECK-LABEL: spill_fpr16:
-; CHECK: 2-byte Folded Spill
-; CHECK: 2-byte Folded Reload
- %val1 = load volatile %bigtype_v1i16* %addr
- %val2 = load volatile %bigtype_v1i16* %addr
- store volatile %bigtype_v1i16 %val1, %bigtype_v1i16* %addr
- store volatile %bigtype_v1i16 %val2, %bigtype_v1i16* %addr
- ret void
-}
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-truncStore-extLoad.ll b/test/CodeGen/AArch64/neon-truncStore-extLoad.ll
index e5b7694..1df3719 100644
--- a/test/CodeGen/AArch64/neon-truncStore-extLoad.ll
+++ b/test/CodeGen/AArch64/neon-truncStore-extLoad.ll
@@ -5,7 +5,7 @@
define void @truncStore.v2i64(<2 x i64> %a, <2 x i32>* %result) {
; CHECK-LABEL: truncStore.v2i64:
; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
-; CHECK: st1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
+; CHECK: {{st1 { v[0-9]+.2s }|str d[0-9]+}}, [x{{[0-9]+|sp}}]
%b = trunc <2 x i64> %a to <2 x i32>
store <2 x i32> %b, <2 x i32>* %result
ret void
@@ -14,7 +14,7 @@ define void @truncStore.v2i64(<2 x i64> %a, <2 x i32>* %result) {
define void @truncStore.v4i32(<4 x i32> %a, <4 x i16>* %result) {
; CHECK-LABEL: truncStore.v4i32:
; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
-; CHECK: st1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
+; CHECK: {{st1 { v[0-9]+.4h }|str d[0-9]+}}, [x{{[0-9]+|sp}}]
%b = trunc <4 x i32> %a to <4 x i16>
store <4 x i16> %b, <4 x i16>* %result
ret void
@@ -23,7 +23,7 @@ define void @truncStore.v4i32(<4 x i32> %a, <4 x i16>* %result) {
define void @truncStore.v8i16(<8 x i16> %a, <8 x i8>* %result) {
; CHECK-LABEL: truncStore.v8i16:
; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
-; CHECK: st1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
+; CHECK: {{st1 { v[0-9]+.8b }|str d[0-9]+}}, [x{{[0-9]+|sp}}]
%b = trunc <8 x i16> %a to <8 x i8>
store <8 x i8> %b, <8 x i8>* %result
ret void
@@ -54,4 +54,4 @@ define i32 @loadExt.i32(<4 x i8>* %ref) {
%vecext = extractelement <4 x i8> %a, i32 0
%conv = zext i8 %vecext to i32
ret i32 %conv
-}
\ No newline at end of file
+}
diff --git a/test/CodeGen/AArch64/nzcv-save.ll b/test/CodeGen/AArch64/nzcv-save.ll
new file mode 100644
index 0000000..32baff3
--- /dev/null
+++ b/test/CodeGen/AArch64/nzcv-save.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=aarch64 < %s | FileCheck %s
+
+; CHECK: mrs [[NZCV_SAVE:x[0-9]+]], NZCV
+; CHECK: msr NZCV, [[NZCV_SAVE]]
+
+; DAG ends up with two uses for the flags from an ADCS node, which means they
+; must be saved for later.
+define void @f(i256* nocapture %a, i256* nocapture %b, i256* nocapture %cc, i256* nocapture %dd) nounwind uwtable noinline ssp {
+entry:
+ %c = load i256* %cc
+ %d = load i256* %dd
+ %add = add nsw i256 %c, %d
+ store i256 %add, i256* %a, align 8
+ %or = or i256 %c, 1606938044258990275541962092341162602522202993782792835301376
+ %add6 = add nsw i256 %or, %d
+ store i256 %add6, i256* %b, align 8
+ ret void
+}
diff --git a/test/CodeGen/AArch64/pic-eh-stubs.ll b/test/CodeGen/AArch64/pic-eh-stubs.ll
index 3404d3f..e8c7625 100644
--- a/test/CodeGen/AArch64/pic-eh-stubs.ll
+++ b/test/CodeGen/AArch64/pic-eh-stubs.ll
@@ -1,5 +1,5 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
-; RUN: llc -mtriple=aarch64_be-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
+; RUN: llc -mtriple=arm64_be-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
; Make sure exception-handling PIC code can be linked correctly. An alternative
; to the sequence described below would have .gcc_except_table itself writable
@@ -11,8 +11,8 @@
; ... referring indirectly to stubs for its typeinfo ...
; CHECK: // @TType Encoding = indirect pcrel sdata8
; ... one of which is "int"'s typeinfo
-; CHECK: .Ltmp7:
-; CHECK-NEXT: .xword .L_ZTIi.DW.stub-.Ltmp7
+; CHECK: [[TYPEINFO_LBL:.Ltmp[0-9]+]]: // TypeInfo 1
+; CHECK-NEXT: .xword .L_ZTIi.DW.stub-[[TYPEINFO_LBL]]
; .. and which is properly defined (in a writable section for the dynamic loader) later.
; CHECK: .section .data.rel,"aw"
diff --git a/test/CodeGen/AArch64/ragreedy-csr.ll b/test/CodeGen/AArch64/ragreedy-csr.ll
index 18a948b..de29b1b 100644
--- a/test/CodeGen/AArch64/ragreedy-csr.ll
+++ b/test/CodeGen/AArch64/ragreedy-csr.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -regalloc=greedy -regalloc-csr-first-time-cost=15 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -regalloc=greedy -regalloc-csr-first-time-cost=15 | FileCheck %s
; This testing case is reduced from 197.parser prune_match function.
; We make sure that we do not use callee-saved registers (x19 to x25).
@@ -6,14 +6,14 @@
; CHECK-LABEL: prune_match:
; CHECK: entry
-; CHECK: str x30, [sp
+; CHECK: {{str x30|stp x29, x30}}, [sp
; CHECK-NOT: stp x25,
; CHECK-NOT: stp x23, x24
; CHECK-NOT: stp x21, x22
; CHECK-NOT: stp x19, x20
; CHECK: if.end
; CHECK: return
-; CHECK: ldr x30, [sp
+; CHECK: {{ldr x30|ldp x29, x30}}, [sp
; CHECK-NOT: ldp x19, x20
; CHECK-NOT: ldp x21, x22
; CHECK-NOT: ldp x23, x24
diff --git a/test/CodeGen/AArch64/regress-bitcast-formals.ll b/test/CodeGen/AArch64/regress-bitcast-formals.ll
index 9655f90..58e0542 100644
--- a/test/CodeGen/AArch64/regress-bitcast-formals.ll
+++ b/test/CodeGen/AArch64/regress-bitcast-formals.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-ios7.0 -verify-machineinstrs < %s | FileCheck %s
; CallingConv.td requires a bitcast for vector arguments. Make sure we're
; actually capable of that (the test was omitted from LowerFormalArguments).
diff --git a/test/CodeGen/AArch64/regress-f128csel-flags.ll b/test/CodeGen/AArch64/regress-f128csel-flags.ll
index b35185c..25b5e0c 100644
--- a/test/CodeGen/AArch64/regress-f128csel-flags.ll
+++ b/test/CodeGen/AArch64/regress-f128csel-flags.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
; We used to not mark NZCV as being used in the continuation basic-block
; when lowering a 128-bit "select" to branches. This meant a subsequent use
diff --git a/test/CodeGen/AArch64/regress-fp128-livein.ll b/test/CodeGen/AArch64/regress-fp128-livein.ll
index cb8432a..5e6ab0a 100644
--- a/test/CodeGen/AArch64/regress-fp128-livein.ll
+++ b/test/CodeGen/AArch64/regress-fp128-livein.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s
; Regression test for NZCV reg live-in not being added to fp128csel IfTrue BB,
; causing a crash during live range calc.
diff --git a/test/CodeGen/AArch64/regress-tail-livereg.ll b/test/CodeGen/AArch64/regress-tail-livereg.ll
index 053249c..e32ac84 100644
--- a/test/CodeGen/AArch64/regress-tail-livereg.ll
+++ b/test/CodeGen/AArch64/regress-tail-livereg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s
@var = global void()* zeroinitializer
declare void @bar()
diff --git a/test/CodeGen/AArch64/regress-tblgen-chains.ll b/test/CodeGen/AArch64/regress-tblgen-chains.ll
index ff77fb4..477d996 100644
--- a/test/CodeGen/AArch64/regress-tblgen-chains.ll
+++ b/test/CodeGen/AArch64/regress-tblgen-chains.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s
; When generating DAG selection tables, TableGen used to only flag an
; instruction as needing a chain on its own account if it had a built-in pattern
@@ -17,17 +17,18 @@ define i64 @test_chains() {
%locvar = alloca i8
call void @bar(i8* %locvar)
-; CHECK: bl bar
+; CHECK: bl {{_?bar}}
%inc.1 = load i8* %locvar
%inc.2 = zext i8 %inc.1 to i64
%inc.3 = add i64 %inc.2, 1
%inc.4 = trunc i64 %inc.3 to i8
store i8 %inc.4, i8* %locvar
-; CHECK: ldrb {{w[0-9]+}}, [sp, [[LOCADDR:#[0-9]+]]]
-; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #1
-; CHECK: strb {{w[0-9]+}}, [sp, [[LOCADDR]]]
-; CHECK: ldrb {{w[0-9]+}}, [sp, [[LOCADDR]]]
+
+; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR:#-?[0-9]+]]]
+; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #1
+; CHECK: sturb {{w[0-9]+}}, [x29, [[LOCADDR]]]
+; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR]]]
%ret.1 = load i8* %locvar
%ret.2 = zext i8 %ret.1 to i64
diff --git a/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll b/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll
index 0ef9818..c3167e4 100644
--- a/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll
+++ b/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll
@@ -5,22 +5,7 @@ declare void @bar()
define void @test_w29_reserved() {
; CHECK-LABEL: test_w29_reserved:
-; CHECK: .cfi_startproc
-; CHECK: .cfi_def_cfa sp, 96
; CHECK: add x29, sp, #{{[0-9]+}}
-; CHECK: .cfi_def_cfa x29, 16
-; CHECK: .cfi_offset x30, -8
-; CHECK: .cfi_offset x29, -16
-; CHECK: .cfi_offset x28, -24
-; CHECK: .cfi_offset x27, -32
-; CHECK: .cfi_offset x26, -40
-; CHECK: .cfi_offset x25, -48
-; CHECK: .cfi_offset x24, -56
-; CHECK: .cfi_offset x23, -64
-; CHECK: .cfi_offset x22, -72
-; CHECK: .cfi_offset x21, -80
-; CHECK: .cfi_offset x20, -88
-; CHECK: .cfi_offset x19, -96
%val1 = load volatile i32* @var
%val2 = load volatile i32* @var
diff --git a/test/CodeGen/AArch64/regress-wzr-allocatable.ll b/test/CodeGen/AArch64/regress-wzr-allocatable.ll
deleted file mode 100644
index 764d2bc..0000000
--- a/test/CodeGen/AArch64/regress-wzr-allocatable.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0
-
-; When WZR wasn't marked as reserved, this function tried to allocate
-; it at O0 and then generated an internal fault (mostly incidentally)
-; when it discovered that it was already in use for a multiplication.
-
-; I'm not really convinced this is a good test since it could easily
-; stop testing what it does now with no-one any the wiser. However, I
-; can't think of a better way to force the allocator to use WZR
-; specifically.
-
-define void @test() nounwind {
-entry:
- br label %for.cond
-
-for.cond: ; preds = %for.body, %entry
- br i1 undef, label %for.body, label %for.end
-
-for.body: ; preds = %for.cond
- br label %for.cond
-
-for.end: ; preds = %for.cond
- br label %for.cond6
-
-for.cond6: ; preds = %for.body9, %for.end
- br i1 undef, label %for.body9, label %while.cond30
-
-for.body9: ; preds = %for.cond6
- store i16 0, i16* undef, align 2
- %0 = load i32* undef, align 4
- %1 = load i32* undef, align 4
- %mul15 = mul i32 %0, %1
- %add16 = add i32 %mul15, 32768
- %div = udiv i32 %add16, 65535
- %add17 = add i32 %div, 1
- store i32 %add17, i32* undef, align 4
- br label %for.cond6
-
-while.cond30: ; preds = %for.cond6
- ret void
-}
diff --git a/test/CodeGen/AArch64/returnaddr.ll b/test/CodeGen/AArch64/returnaddr.ll
index c85f9ec..b136f04 100644
--- a/test/CodeGen/AArch64/returnaddr.ll
+++ b/test/CodeGen/AArch64/returnaddr.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
define i8* @rt0(i32 %x) nounwind readnone {
entry:
diff --git a/test/CodeGen/AArch64/setcc-takes-i32.ll b/test/CodeGen/AArch64/setcc-takes-i32.ll
index bd79685..ec86159 100644
--- a/test/CodeGen/AArch64/setcc-takes-i32.ll
+++ b/test/CodeGen/AArch64/setcc-takes-i32.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -o - %s | FileCheck %s
; Most important point here is that the promotion of the i1 works
; correctly. Previously LLVM thought that i64 was the appropriate SetCC output,
diff --git a/test/CodeGen/AArch64/sext_inreg.ll b/test/CodeGen/AArch64/sext_inreg.ll
deleted file mode 100644
index 2f76081..0000000
--- a/test/CodeGen/AArch64/sext_inreg.ll
+++ /dev/null
@@ -1,198 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-; For formal arguments, we have the following vector type promotion,
-; v2i8 is promoted to v2i32(f64)
-; v2i16 is promoted to v2i32(f64)
-; v4i8 is promoted to v4i16(f64)
-; v8i1 is promoted to v8i16(f128)
-
-define <2 x i8> @test_sext_inreg_v2i8i16(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v2i8i16
-; CHECK: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h
- %1 = sext <2 x i8> %v1 to <2 x i16>
- %2 = sext <2 x i8> %v2 to <2 x i16>
- %3 = shufflevector <2 x i16> %1, <2 x i16> %2, <2 x i32> <i32 0, i32 2>
- %4 = trunc <2 x i16> %3 to <2 x i8>
- ret <2 x i8> %4
-}
-
-define <2 x i8> @test_sext_inreg_v2i8i16_2(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v2i8i16_2
-; CHECK: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h
- %a1 = shl <2 x i32> %v1, <i32 24, i32 24>
- %a2 = ashr <2 x i32> %a1, <i32 24, i32 24>
- %b1 = shl <2 x i32> %v2, <i32 24, i32 24>
- %b2 = ashr <2 x i32> %b1, <i32 24, i32 24>
- %c = shufflevector <2 x i32> %a2, <2 x i32> %b2, <2 x i32> <i32 0, i32 2>
- %d = trunc <2 x i32> %c to <2 x i8>
- ret <2 x i8> %d
-}
-
-define <2 x i8> @test_sext_inreg_v2i8i32(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v2i8i32
-; CHECK: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h
- %1 = sext <2 x i8> %v1 to <2 x i32>
- %2 = sext <2 x i8> %v2 to <2 x i32>
- %3 = shufflevector <2 x i32> %1, <2 x i32> %2, <2 x i32> <i32 0, i32 2>
- %4 = trunc <2 x i32> %3 to <2 x i8>
- ret <2 x i8> %4
-}
-
-define <2 x i8> @test_sext_inreg_v2i8i64(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v2i8i64
-; CHECK: ushll v1.2d, v1.2s, #0
-; CHECK: ushll v0.2d, v0.2s, #0
-; CHECK: shl v0.2d, v0.2d, #56
-; CHECK: sshr v0.2d, v0.2d, #56
-; CHECK: shl v1.2d, v1.2d, #56
-; CHECK: sshr v1.2d, v1.2d, #56
- %1 = sext <2 x i8> %v1 to <2 x i64>
- %2 = sext <2 x i8> %v2 to <2 x i64>
- %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
- %4 = trunc <2 x i64> %3 to <2 x i8>
- ret <2 x i8> %4
-}
-
-define <4 x i8> @test_sext_inreg_v4i8i16(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v4i8i16
-; CHECK: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h
- %1 = sext <4 x i8> %v1 to <4 x i16>
- %2 = sext <4 x i8> %v2 to <4 x i16>
- %3 = shufflevector <4 x i16> %1, <4 x i16> %2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
- %4 = trunc <4 x i16> %3 to <4 x i8>
- ret <4 x i8> %4
-}
-
-define <4 x i8> @test_sext_inreg_v4i8i16_2(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v4i8i16_2
-; CHECK: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: uzp1 v1.8h, v1.8h, v1.8h
- %a1 = shl <4 x i16> %v1, <i16 8, i16 8, i16 8, i16 8>
- %a2 = ashr <4 x i16> %a1, <i16 8, i16 8, i16 8, i16 8>
- %b1 = shl <4 x i16> %v2, <i16 8, i16 8, i16 8, i16 8>
- %b2 = ashr <4 x i16> %b1, <i16 8, i16 8, i16 8, i16 8>
- %c = shufflevector <4 x i16> %a2, <4 x i16> %b2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
- %d = trunc <4 x i16> %c to <4 x i8>
- ret <4 x i8> %d
-}
-
-define <4 x i8> @test_sext_inreg_v4i8i32(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v4i8i32
-; CHECK: ushll v1.4s, v1.4h, #0
-; CHECK: ushll v0.4s, v0.4h, #0
-; CHECK: shl v0.4s, v0.4s, #24
-; CHECK: sshr v0.4s, v0.4s, #24
-; CHECK: shl v1.4s, v1.4s, #24
-; CHECK: sshr v1.4s, v1.4s, #24
- %1 = sext <4 x i8> %v1 to <4 x i32>
- %2 = sext <4 x i8> %v2 to <4 x i32>
- %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
- %4 = trunc <4 x i32> %3 to <4 x i8>
- ret <4 x i8> %4
-}
-
-define <8 x i8> @test_sext_inreg_v8i8i16(<8 x i8> %v1, <8 x i8> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v8i8i16
-; CHECK: sshll v0.8h, v0.8b, #0
-; CHECK: sshll v1.8h, v1.8b, #0
- %1 = sext <8 x i8> %v1 to <8 x i16>
- %2 = sext <8 x i8> %v2 to <8 x i16>
- %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
- %4 = trunc <8 x i16> %3 to <8 x i8>
- ret <8 x i8> %4
-}
-
-define <8 x i1> @test_sext_inreg_v8i1i16(<8 x i1> %v1, <8 x i1> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v8i1i16
-; CHECK: ushll v1.8h, v1.8b, #0
-; CHECK: ushll v0.8h, v0.8b, #0
-; CHECK: shl v0.8h, v0.8h, #15
-; CHECK: sshr v0.8h, v0.8h, #15
-; CHECK: shl v1.8h, v1.8h, #15
-; CHECK: sshr v1.8h, v1.8h, #15
- %1 = sext <8 x i1> %v1 to <8 x i16>
- %2 = sext <8 x i1> %v2 to <8 x i16>
- %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
- %4 = trunc <8 x i16> %3 to <8 x i1>
- ret <8 x i1> %4
-}
-
-define <2 x i16> @test_sext_inreg_v2i16i32(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v2i16i32
-; CHECK: sshll v0.4s, v0.4h, #0
-; CHECK-NEXT: uzp1 v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: sshll v1.4s, v1.4h, #0
-; CHECK-NEXT: uzp1 v1.4s, v1.4s, v1.4s
- %1 = sext <2 x i16> %v1 to <2 x i32>
- %2 = sext <2 x i16> %v2 to <2 x i32>
- %3 = shufflevector <2 x i32> %1, <2 x i32> %2, <2 x i32> <i32 0, i32 2>
- %4 = trunc <2 x i32> %3 to <2 x i16>
- ret <2 x i16> %4
-}
-
-define <2 x i16> @test_sext_inreg_v2i16i32_2(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v2i16i32_2
-; CHECK: sshll v0.4s, v0.4h, #0
-; CHECK-NEXT: uzp1 v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: sshll v1.4s, v1.4h, #0
-; CHECK-NEXT: uzp1 v1.4s, v1.4s, v1.4s
- %a1 = shl <2 x i32> %v1, <i32 16, i32 16>
- %a2 = ashr <2 x i32> %a1, <i32 16, i32 16>
- %b1 = shl <2 x i32> %v2, <i32 16, i32 16>
- %b2 = ashr <2 x i32> %b1, <i32 16, i32 16>
- %c = shufflevector <2 x i32> %a2, <2 x i32> %b2, <2 x i32> <i32 0, i32 2>
- %d = trunc <2 x i32> %c to <2 x i16>
- ret <2 x i16> %d
-}
-
-define <2 x i16> @test_sext_inreg_v2i16i64(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v2i16i64
-; CHECK: ushll v1.2d, v1.2s, #0
-; CHECK: ushll v0.2d, v0.2s, #0
-; CHECK: shl v0.2d, v0.2d, #48
-; CHECK: sshr v0.2d, v0.2d, #48
-; CHECK: shl v1.2d, v1.2d, #48
-; CHECK: sshr v1.2d, v1.2d, #48
- %1 = sext <2 x i16> %v1 to <2 x i64>
- %2 = sext <2 x i16> %v2 to <2 x i64>
- %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
- %4 = trunc <2 x i64> %3 to <2 x i16>
- ret <2 x i16> %4
-}
-
-define <4 x i16> @test_sext_inreg_v4i16i32(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v4i16i32
-; CHECK: sshll v0.4s, v0.4h, #0
-; CHECK: sshll v1.4s, v1.4h, #0
- %1 = sext <4 x i16> %v1 to <4 x i32>
- %2 = sext <4 x i16> %v2 to <4 x i32>
- %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
- %4 = trunc <4 x i32> %3 to <4 x i16>
- ret <4 x i16> %4
-}
-
-define <2 x i32> @test_sext_inreg_v2i32i64(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
-; CHECK-LABEL: test_sext_inreg_v2i32i64
-; CHECK: sshll v0.2d, v0.2s, #0
-; CHECK: sshll v1.2d, v1.2s, #0
- %1 = sext <2 x i32> %v1 to <2 x i64>
- %2 = sext <2 x i32> %v2 to <2 x i64>
- %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
- %4 = trunc <2 x i64> %3 to <2 x i32>
- ret <2 x i32> %4
-}
-
diff --git a/test/CodeGen/AArch64/sibling-call.ll b/test/CodeGen/AArch64/sibling-call.ll
index 20f1062..34e3bb4 100644
--- a/test/CodeGen/AArch64/sibling-call.ll
+++ b/test/CodeGen/AArch64/sibling-call.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -aarch64-load-store-opt=0 | FileCheck %s
declare void @callee_stack0()
declare void @callee_stack8([8 x i32], i64)
@@ -73,10 +73,10 @@ define void @caller_to16_from16([8 x i32], i64 %a, i64 %b) {
tail call void @callee_stack16([8 x i32] undef, i64 %b, i64 %a)
ret void
-; CHECK: ldr x0,
-; CHECK: ldr x1,
-; CHECK: str x1,
-; CHECK: str x0,
+; CHECK: ldr [[VAL0:x[0-9]+]],
+; CHECK: ldr [[VAL1:x[0-9]+]],
+; CHECK: str [[VAL1]],
+; CHECK: str [[VAL0]],
; CHECK-NOT: add sp, sp,
; CHECK: b callee_stack16
@@ -91,7 +91,7 @@ define void @indirect_tail() {
%fptr = load void(i32)** @func
tail call void %fptr(i32 42)
ret void
-; CHECK: ldr [[FPTR:x[1-9]+]], [{{x[0-9]+}}, #:lo12:func]
-; CHECK: movz w0, #42
+; CHECK: ldr [[FPTR:x[1-9]+]], [{{x[0-9]+}}, {{#?}}:lo12:func]
+; CHECK: movz w0, #{{42|0x2a}}
; CHECK: br [[FPTR]]
}
diff --git a/test/CodeGen/AArch64/sincos-expansion.ll b/test/CodeGen/AArch64/sincos-expansion.ll
index 4cd4449..c3a172d 100644
--- a/test/CodeGen/AArch64/sincos-expansion.ll
+++ b/test/CodeGen/AArch64/sincos-expansion.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
define float @test_sincos_f32(float %f) {
%sin = call float @sinf(float %f) readnone
diff --git a/test/CodeGen/AArch64/sincospow-vector-expansion.ll b/test/CodeGen/AArch64/sincospow-vector-expansion.ll
index 259a55e..22f33a8 100644
--- a/test/CodeGen/AArch64/sincospow-vector-expansion.ll
+++ b/test/CodeGen/AArch64/sincospow-vector-expansion.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc -o - %s -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+neon | FileCheck %s
define <2 x float> @test_cos_v2f64(<2 x double> %v1) {
diff --git a/test/CodeGen/AArch64/tail-call.ll b/test/CodeGen/AArch64/tail-call.ll
index 81885f1..8aab842 100644
--- a/test/CodeGen/AArch64/tail-call.ll
+++ b/test/CodeGen/AArch64/tail-call.ll
@@ -7,8 +7,10 @@ declare fastcc void @callee_stack16([8 x i32], i64, i64)
define fastcc void @caller_to0_from0() nounwind {
; CHECK-LABEL: caller_to0_from0:
; CHECK-NEXT: // BB
+
tail call fastcc void @callee_stack0()
ret void
+
; CHECK-NEXT: b callee_stack0
}
@@ -17,6 +19,7 @@ define fastcc void @caller_to0_from8([8 x i32], i64) {
tail call fastcc void @callee_stack0()
ret void
+
; CHECK: add sp, sp, #16
; CHECK-NEXT: b callee_stack0
}
@@ -29,8 +32,8 @@ define fastcc void @caller_to8_from0() {
; pointer (we didn't have arg space to reuse).
tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
ret void
-; CHECK: str {{x[0-9]+}}, [sp, #16]
-; CHECK-NEXT: add sp, sp, #16
+
+; CHECK: str {{x[0-9]+}}, [sp, #16]!
; CHECK-NEXT: b callee_stack8
}
@@ -41,8 +44,8 @@ define fastcc void @caller_to8_from8([8 x i32], i64 %a) {
; Key point is that the "%a" should go where at SP on entry.
tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
ret void
-; CHECK: str {{x[0-9]+}}, [sp, #16]
-; CHECK-NEXT: add sp, sp, #16
+
+; CHECK: str {{x[0-9]+}}, [sp, #16]!
; CHECK-NEXT: b callee_stack8
}
@@ -54,10 +57,10 @@ define fastcc void @caller_to16_from8([8 x i32], i64 %a) {
; above %a on the stack. If it tries to go below incoming-SP then the
; callee will not deallocate the space, even in fastcc.
tail call fastcc void @callee_stack16([8 x i32] undef, i64 42, i64 2)
-; CHECK: str {{x[0-9]+}}, [sp, #24]
-; CHECK: str {{x[0-9]+}}, [sp, #16]
-; CHECK: add sp, sp, #16
-; CHECK: b callee_stack16
+
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: b callee_stack16
ret void
}
@@ -69,8 +72,8 @@ define fastcc void @caller_to8_from24([8 x i32], i64 %a, i64 %b, i64 %c) {
; Key point is that the "%a" should go where at #16 above SP on entry.
tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
ret void
-; CHECK: str {{x[0-9]+}}, [sp, #32]
-; CHECK-NEXT: add sp, sp, #32
+
+; CHECK: str {{x[0-9]+}}, [sp, #32]!
; CHECK-NEXT: b callee_stack8
}
@@ -84,11 +87,8 @@ define fastcc void @caller_to16_from16([8 x i32], i64 %a, i64 %b) {
tail call fastcc void @callee_stack16([8 x i32] undef, i64 %b, i64 %a)
ret void
-; CHECK: ldr x0,
-; CHECK: ldr x1,
-; CHECK: str x1,
-; CHECK: str x0,
-
-; CHECK: add sp, sp, #16
-; CHECK: b callee_stack16
+; CHECK: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: b callee_stack16
}
diff --git a/test/CodeGen/AArch64/tst-br.ll b/test/CodeGen/AArch64/tst-br.ll
index 154bc08..8a2fe26 100644
--- a/test/CodeGen/AArch64/tst-br.ll
+++ b/test/CodeGen/AArch64/tst-br.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
; We've got the usual issues with LLVM reordering blocks here. The
; tests are correct for the current order, but who knows when that
@@ -15,7 +15,7 @@ define i32 @test_tbz() {
%tbit0 = and i32 %val, 32768
%tst0 = icmp ne i32 %tbit0, 0
br i1 %tst0, label %test1, label %end1
-; CHECK: tbz {{w[0-9]+}}, #15, [[LBL_end1:.LBB0_[0-9]+]]
+; CHECK: tbz {{w[0-9]+}}, #15, [[LBL_end1:.?LBB0_[0-9]+]]
test1:
%tbit1 = and i32 %val, 4096
@@ -27,22 +27,22 @@ test2:
%tbit2 = and i64 %val64, 32768
%tst2 = icmp ne i64 %tbit2, 0
br i1 %tst2, label %test3, label %end1
-; CHECK: tbz {{x[0-9]+}}, #15, [[LBL_end1]]
+; CHECK: tbz {{[wx][0-9]+}}, #15, [[LBL_end1]]
test3:
%tbit3 = and i64 %val64, 4096
%tst3 = icmp ne i64 %tbit3, 0
br i1 %tst3, label %end2, label %end1
-; CHECK: tbz {{x[0-9]+}}, #12, [[LBL_end1]]
+; CHECK: tbz {{[wx][0-9]+}}, #12, [[LBL_end1]]
end2:
-; CHECK: movz x0, #1
+; CHECK: {{movz x0, #1|orr w0, wzr, #0x1}}
; CHECK-NEXT: ret
ret i32 1
end1:
; CHECK: [[LBL_end1]]:
-; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: {{mov x0, xzr|mov w0, wzr}}
; CHECK-NEXT: ret
ret i32 0
}
diff --git a/test/CodeGen/AArch64/variadic.ll b/test/CodeGen/AArch64/variadic.ll
deleted file mode 100644
index 1c7f1e0..0000000
--- a/test/CodeGen/AArch64/variadic.ll
+++ /dev/null
@@ -1,241 +0,0 @@
-; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
-; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 < %s | FileCheck --check-prefix=CHECK-NOFP %s
-
-%va_list = type {i8*, i8*, i8*, i32, i32}
-
-@var = global %va_list zeroinitializer
-
-declare void @llvm.va_start(i8*)
-
-define void @test_simple(i32 %n, ...) {
-; CHECK-LABEL: test_simple:
-; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]]
-; CHECK: mov x[[FPRBASE:[0-9]+]], sp
-; CHECK: str q7, [x[[FPRBASE]], #112]
-; CHECK: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]]
-; CHECK: str x7, [x[[GPRBASE]], #48]
-
-; CHECK-NOFP: sub sp, sp, #[[STACKSIZE:[0-9]+]]
-; CHECK-NOFP: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]]
-; CHECK-NOFP: str x7, [x[[GPRBASE]], #48]
-; CHECK-NOFP-NOT: str q7,
-; CHECK-NOFP: str x1, [sp, #[[GPRFROMSP]]]
-
-; Omit the middle ones
-
-; CHECK: str q0, [sp]
-; CHECK: str x1, [sp, #[[GPRFROMSP]]]
-; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-
-; CHECK-NOFP-NOT: str q0, [sp]
-; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-
- %addr = bitcast %va_list* @var to i8*
- call void @llvm.va_start(i8* %addr)
-; CHECK: movn [[VR_OFFS:w[0-9]+]], #127
-; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28]
-; CHECK: movn [[GR_OFFS:w[0-9]+]], #55
-; CHECK: str [[GR_OFFS]], [x[[VA_LIST]], #24]
-; CHECK: add [[VR_TOP:x[0-9]+]], x[[FPRBASE]], #128
-; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
-; CHECK: add [[GR_TOP:x[0-9]+]], x[[GPRBASE]], #56
-; CHECK: str [[GR_TOP]], [x[[VA_LIST]], #8]
-; CHECK: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]]
-; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
-
-; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28]
-; CHECK-NOFP: movn [[GR_OFFS:w[0-9]+]], #55
-; CHECK-NOFP: str [[GR_OFFS]], [x[[VA_LIST]], #24]
-; CHECK-NOFP: add [[GR_TOP:x[0-9]+]], x[[GPRBASE]], #56
-; CHECK-NOFP: str [[GR_TOP]], [x[[VA_LIST]], #8]
-; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]]
-; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
-
- ret void
-}
-
-define void @test_fewargs(i32 %n, i32 %n1, i32 %n2, float %m, ...) {
-; CHECK-LABEL: test_fewargs:
-; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]]
-; CHECK: mov x[[FPRBASE:[0-9]+]], sp
-; CHECK: str q7, [x[[FPRBASE]], #96]
-; CHECK: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]]
-; CHECK: str x7, [x[[GPRBASE]], #32]
-
-; CHECK-NOFP: sub sp, sp, #[[STACKSIZE:[0-9]+]]
-; CHECK-NOFP-NOT: str q7,
-; CHECK-NOFP: mov x[[GPRBASE:[0-9]+]], sp
-; CHECK-NOFP: str x7, [x[[GPRBASE]], #24]
-
-; Omit the middle ones
-
-; CHECK: str q1, [sp]
-; CHECK: str x3, [sp, #[[GPRFROMSP]]]
-
-; CHECK-NOFP-NOT: str q1, [sp]
-; CHECK-NOFP: str x4, [sp]
-
- %addr = bitcast %va_list* @var to i8*
- call void @llvm.va_start(i8* %addr)
-; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-; CHECK: movn [[VR_OFFS:w[0-9]+]], #111
-; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28]
-; CHECK: movn [[GR_OFFS:w[0-9]+]], #39
-; CHECK: str [[GR_OFFS]], [x[[VA_LIST]], #24]
-; CHECK: add [[VR_TOP:x[0-9]+]], x[[FPRBASE]], #112
-; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
-; CHECK: add [[GR_TOP:x[0-9]+]], x[[GPRBASE]], #40
-; CHECK: str [[GR_TOP]], [x[[VA_LIST]], #8]
-; CHECK: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]]
-; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
-
-; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28]
-; CHECK-NOFP: movn [[GR_OFFS:w[0-9]+]], #31
-; CHECK-NOFP: str [[GR_OFFS]], [x[[VA_LIST]], #24]
-; CHECK-NOFP: add [[GR_TOP:x[0-9]+]], x[[GPRBASE]], #32
-; CHECK-NOFP: str [[GR_TOP]], [x[[VA_LIST]], #8]
-; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]]
-; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
-
- ret void
-}
-
-define void @test_nospare([8 x i64], [8 x float], ...) {
-; CHECK-LABEL: test_nospare:
-
- %addr = bitcast %va_list* @var to i8*
- call void @llvm.va_start(i8* %addr)
-; CHECK-NOT: sub sp, sp
-; CHECK: mov [[STACK:x[0-9]+]], sp
-; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
-
-; CHECK-NOFP-NOT: sub sp, sp
-; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #64
-; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
- ret void
-}
-
-; If there are non-variadic arguments on the stack (here two i64s) then the
-; __stack field should point just past them.
-define void @test_offsetstack([10 x i64], [3 x float], ...) {
-; CHECK-LABEL: test_offsetstack:
-; CHECK: sub sp, sp, #80
-; CHECK: mov x[[FPRBASE:[0-9]+]], sp
-; CHECK: str q7, [x[[FPRBASE]], #64]
-
-; CHECK-NOT: str x{{[0-9]+}},
-
-; CHECK-NOFP-NOT: str q7,
-; CHECK-NOT: str x7,
-
-; Omit the middle ones
-
-; CHECK: str q3, [sp]
-
- %addr = bitcast %va_list* @var to i8*
- call void @llvm.va_start(i8* %addr)
-; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-; CHECK: movn [[VR_OFFS:w[0-9]+]], #79
-; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28]
-; CHECK: str wzr, [x[[VA_LIST]], #24]
-; CHECK: add [[VR_TOP:x[0-9]+]], x[[FPRBASE]], #80
-; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
-; CHECK: add [[STACK:x[0-9]+]], sp, #96
-; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
-
-; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #40
-; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
-; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28]
-; CHECK-NOFP: str wzr, [x[[VA_LIST]], #24]
- ret void
-}
-
-declare void @llvm.va_end(i8*)
-
-define void @test_va_end() nounwind {
-; CHECK-LABEL: test_va_end:
-; CHECK-NEXT: BB#0
-; CHECK-NOFP: BB#0
-
- %addr = bitcast %va_list* @var to i8*
- call void @llvm.va_end(i8* %addr)
-
- ret void
-; CHECK-NEXT: ret
-; CHECK-NOFP-NEXT: ret
-}
-
-declare void @llvm.va_copy(i8* %dest, i8* %src)
-
-@second_list = global %va_list zeroinitializer
-
-define void @test_va_copy() {
-; CHECK-LABEL: test_va_copy:
- %srcaddr = bitcast %va_list* @var to i8*
- %dstaddr = bitcast %va_list* @second_list to i8*
- call void @llvm.va_copy(i8* %dstaddr, i8* %srcaddr)
-
-; Check beginning and end again:
-
-; CHECK: add x[[SRC_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-; CHECK: add x[[DEST_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:second_list
-; CHECK: ldr [[BLOCK1:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var]
-; CHECK: ldr [[BLOCK2:x[0-9]+]], [x[[SRC_LIST]], #24]
-; CHECK: str [[BLOCK1]], [{{x[0-9]+}}, #:lo12:second_list]
-; CHECK: str [[BLOCK2]], [x[[DEST_LIST]], #24]
-
-; CHECK-NOFP: add x[[SRC_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-; CHECK-NOFP: add x[[DEST_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:second_list
-; CHECK-NOFP: ldr [[BLOCK1:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var]
-; CHECK-NOFP: ldr [[BLOCK2:x[0-9]+]], [x[[SRC_LIST]], #24]
-; CHECK-NOFP: str [[BLOCK1]], [{{x[0-9]+}}, #:lo12:second_list]
-; CHECK-NOFP: str [[BLOCK2]], [x[[DEST_LIST]], #24]
-
- ret void
-; CHECK: ret
-; CHECK-NOFP: ret
-}
-
-%struct.s_3i = type { i32, i32, i32 }
-
-; This checks that, if the last named argument is not a multiple of 8 bytes,
-; and is allocated on the stack, that __va_list.__stack is initialised to the
-; first 8-byte aligned location above it.
-define void @test_va_odd_struct_on_stack(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, [1 x i64], %struct.s_3i* byval nocapture readnone align 4 %h, ...) {
-; CHECK-LABEL: test_va_odd_struct_on_stack:
-
-; CHECK: sub sp, sp, #128
-; CHECK: mov x[[FPRBASE:[0-9]+]], sp
-; CHECK: str q7, [x[[FPRBASE]], #112]
-
-; CHECK-NOT: str x{{[0-9]+}},
-
-; CHECK-NOFP-NOT: str q7,
-; CHECK-NOT: str x7,
-
-; Omit the middle ones
-
-; CHECK: str q0, [sp]
-
- %addr = bitcast %va_list* @var to i8*
- call void @llvm.va_start(i8* %addr)
-; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-; CHECK: movn [[VR_OFFS:w[0-9]+]], #127
-; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28]
-; CHECK: str wzr, [x[[VA_LIST]], #24]
-; CHECK: add [[VR_TOP:x[0-9]+]], x[[FPRBASE]], #128
-; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
-; This constant would be #140 if it was not 8-byte aligned
-; CHECK: add [[STACK:x[0-9]+]], sp, #144
-; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
-
-; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-; This constant would be #12 if it was not 8-byte aligned
-; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #16
-; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
-; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28]
-; CHECK-NOFP: str wzr, [x[[VA_LIST]], #24]
- ret void
-}
diff --git a/test/CodeGen/AArch64/zero-reg.ll b/test/CodeGen/AArch64/zero-reg.ll
index 9b1e527..bc112ab 100644
--- a/test/CodeGen/AArch64/zero-reg.ll
+++ b/test/CodeGen/AArch64/zero-reg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
@var32 = global i32 0
@var64 = global i64 0
@@ -7,9 +7,9 @@ define void @test_zr() {
; CHECK-LABEL: test_zr:
store i32 0, i32* @var32
-; CHECK: str wzr, [{{x[0-9]+}}, #:lo12:var32]
+; CHECK: str wzr, [{{x[0-9]+}}, {{#?}}:lo12:var32]
store i64 0, i64* @var64
-; CHECK: str xzr, [{{x[0-9]+}}, #:lo12:var64]
+; CHECK: str xzr, [{{x[0-9]+}}, {{#?}}:lo12:var64]
ret void
; CHECK: ret
@@ -23,8 +23,7 @@ define void @test_sp(i32 %val) {
; instruction (0b11111 in the Rn field would mean "sp").
%addr = getelementptr i32* null, i64 0
store i32 %val, i32* %addr
-; CHECK: mov x[[NULL:[0-9]+]], xzr
-; CHECK: str {{w[0-9]+}}, [x[[NULL]]]
+; CHECK: str {{w[0-9]+}}, [{{x[0-9]+|sp}}]
ret void
; CHECK: ret